-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract_skills.py
273 lines (229 loc) · 10.6 KB
/
extract_skills.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# scrape indeed job listings to rank given skills in order of most needed
# takes about 1 minute per 10 job listings or 2 minutes per page
# @author: Osamah Mandawi
# @email: [email protected]
"""This is an explanation of the structure of indeed.com
"""
# This is an example of what the first job listings page for software
# engineering in MA looks like: https://www.indeed.com/jobs?q=software+engineer&l=MA&sort=date
# Now, if we look at a single job: https://www.indeed.com/jobs?q=software+engineer&l=MA&sort=date&vjk=3916106ade6d80b3
# Note that this is the same URL as the one before, with only vjk=3916106ade6d80b3, the unique job id, added to it.
# Overall, this means we can replace the text after q= to get results for a different job (with spaces converted to +),
# and replace text after l= with state abbreviation
# ? Must have the following:
# 1. Have pip ready: https://stackoverflow.com/questions/4750806/how-do-i-install-pip-on-windows?rq=1
# * Note: you may already have pip. To check, open cmd, type python, then run 'import pip'; if no error appears, pip is installed
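# 2. Have selenium ready: https://pypi.org/project/selenium/
# * use: 'pip install selenium' without quotes in cmd
# 3. Have webdriver-manager ready (imported below to locate/install the Chrome driver): https://pypi.org/project/webdriver-manager/
# * use: 'pip install webdriver-manager' without quotes in cmd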
# ? Nice to have the following to get visual results:
# 1. Have easygui ready: https://pypi.org/project/easygui/
# * use: 'pip install easygui' without quotes in cmd
# 2. Have matplotlib ready (this is quite heavy): https://pypi.org/project/matplotlib/
# * use: 'pip install matplotlib' without quotes in cmd
# ! Get the packages to count time program took to run
import datetime
import time
# ! Get the packages to match skills in text and to build the search url
import re
from urllib.parse import urlencode

# ! Get the package to create dialog boxes
import easygui
# ! Get the package to visually print results
import matplotlib.pyplot as plt
# ! Get the package to control the web
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
def headless_options():
    """Build the Chrome options used so the crawler runs in the background.

    Every command-line flag is added in a fixed order, then a profile
    preference is attached to the options object.

    Returns:
        options -- the options configurations to be used in the Google Chrome driver
    """
    chrome_flags = (
        "headless",
        "--window-size=1920,1080",
        "--disable-extensions",
        "--proxy-server='direct://'",
        "--proxy-bypass-list=*",
        "--start-maximized",
        '--headless',
        '--disable-gpu',
        '--disable-dev-shm-usage',
        '--no-sandbox',
        '--ignore-certificate-errors',
    )
    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)
    # NOTE(review): the value 2 presumably tells Chrome to block image loading
    # so pages render faster -- confirm against Chrome's preference docs
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    return options
def set_driver_path():
    """Locate the Chrome driver through ChromeDriverManager.

    Delegates to ChromeDriverManager().install() and hands back whatever
    path that call returns.

    Returns:
        driver_path -- path of Chrome driver
    """
    return ChromeDriverManager().install()
def set_driver(driver_path):
    """Create a Chrome driver from the driver path and the headless options.

    Arguments:
        driver_path {string address} -- path of Chrome driver
    Returns:
        driver -- the Chrome driver to be used for web crawling
    """
    chrome_options = headless_options()
    return webdriver.Chrome(driver_path, options=chrome_options)
def _ask_user(prompt):
    """Show an easygui enterbox and return the stripped answer; exit if the dialog was cancelled."""
    answer = easygui.enterbox(prompt)
    if answer is None:
        # enterbox returns None on cancel; stop cleanly instead of crashing
        # later on a None.replace()/int(None) call
        raise SystemExit("Input cancelled by user; exiting.")
    return answer.strip()


def start_gui():
    """Introduce user to the program, and get some information: which field, which state, how many pages of indeed, and which skills.

    Skills are split on "/", stripped of surrounding whitespace, lower-cased,
    and empty entries are dropped. The search url is built with proper
    url-encoding of the field and state.

    Returns:
        field -- string with what field user wants to look at job listings for (spaces replaced by +)
        state -- which U.S. state to find job listings in
        page_range -- how many pages of indeed job listings to search through
        skills -- which skills to look for
        counter_dict -- the skills dictionary with how many jobs each skill appears in
        search_url_master -- a generated url with the field and state chosen by the user sorted by date to reduce duplicates
    Raises:
        SystemExit -- if the user cancels a dialog or enters no usable skill
        ValueError -- if the number of pages is not a whole number
    """
    easygui.msgbox("Welcome to Extract_Skills V.0.1\nWe will be using indeed.com to extract our data.\nNote that we will only be looking at job listings within the U.S.\n")
    raw_field = _ask_user(
        "What kind of job do you want data on?\n(e.g. software engineering)")
    field = raw_field.replace(" ", "+")
    state = _ask_user(
        "What state in the U.S. are you looking at?\n(e.g. CA)")
    pages_answer = _ask_user("How many pages of indeed.com to scrape?")
    try:
        pages_range = int(pages_answer)
    except ValueError:
        raise ValueError(
            "Number of pages must be a whole number, got: " + repr(pages_answer)) from None
    skills_answer = _ask_user(
        "Enter skills, such as programming languages, to look for, seperated by /\n(e.g. python/sas/sql/java/php/master's degree/bachelor's degree)")
    #! strip each entry so "python / sql" yields "sql" rather than " sql", and drop blanks
    skills = [entry.strip().lower()
              for entry in skills_answer.split("/") if entry.strip()]
    if not skills:
        raise SystemExit("No skills were entered; nothing to look for.")
    counter_dict = {skill: 0 for skill in skills}
    #! create the search url using the job type and location;
    #! urlencode turns spaces into + and escapes characters such as + or & in the input
    search_url_master = 'https://www.indeed.com/jobs?' + urlencode(
        {"q": raw_field, "l": state, "sort": "date"})
    print("Skills looking for:", skills)
    return field, state, pages_range, skills, counter_dict, search_url_master
def gather_job_listings(pages_range, search_url_master, driver_path):
    """Gather all the job listings on as many pages as requested by the user.

    A fresh driver is started for every results page and is always quit,
    even when loading or parsing the page raises.

    Arguments:
        pages_range {integer} -- how many pages of indeed job listings to search through
        search_url_master {string} -- a generated url with the field and state chosen by the user sorted by date to reduce duplicates
        driver_path {string address} -- path of Chrome driver
    Returns:
        start_time -- time in seconds of when the searching process started
        sites -- the urls of all the job listings in a list, duplicates removed, first-seen order kept
    """
    start_time = time.time()
    print("SEARCH STARTS")
    sites = []
    for page in range(pages_range):
        driver = set_driver(driver_path)
        try:
            #! crawl to the results page; the start offset advances by 10 per page
            #! (assumes 10 listings per results page -- TODO confirm)
            driver.get(search_url_master + '&start=' + str(page * 10))
            #! get the ids of all the job listings; find_elements(by, value) is used
            #! because the find_elements_by_* helpers were removed in newer Selenium
            listings = driver.find_elements("xpath", '//*[@data-jk]')
            job_ids = [listing.get_attribute('data-jk') for listing in listings]
            #! combine the ids with the url so we can later go over them job by job
            sites.extend(search_url_master + '&vjk=' + job_id
                         for job_id in job_ids)
        finally:
            #! always release the browser, otherwise a failed page leaks a Chrome process
            driver.quit()
    #! remove duplicates while preserving order
    sites = list(dict.fromkeys(sites))
    print("Amount of job postings found:", len(sites))
    return start_time, sites
def count_keywords(txt, counter_dict):
    """Count in how many job descriptions each skill appears; update counter_dict.

    A skill is matched as a whole phrase: it may consist of several words
    (e.g. "master's degree") and may touch punctuation ("python,"), but it
    must not be glued to another letter, digit, underscore, "+" or "#", so
    "java" does not match "javascript" and "c" does not match "c++" or "c#".
    Matching is case-sensitive; callers are expected to pass lower-cased
    text together with lower-cased skills.

    Arguments:
        txt {string} -- the text description of the job
        counter_dict {dictionary} -- the dictionary of skills and how often they appeared in different jobs
    Returns:
        counter_dict -- the update skills dictionary with how many jobs each skill appears in
    """
    # collapse every run of whitespace (including newlines) into one space so
    # multi-word skills match regardless of how the description was wrapped
    normalized_txt = " ".join(txt.split())
    for skill in counter_dict:
        phrase = " ".join(skill.split())
        if not phrase:
            # an empty pattern would match every text; a blank skill counts nothing
            continue
        pattern = r"(?<![\w+#])" + re.escape(phrase) + r"(?![\w+#])"
        #! Note that this only counts a skill once even if it appears multiple times in the SAME job description
        if re.search(pattern, normalized_txt):
            counter_dict[skill] += 1
    return counter_dict
def print_results(counter_dict):
    """Print every skill followed by the number of job listings it appeared in, one per line.

    Arguments:
        counter_dict {dictionary} -- the dictionary of skills and how often they appeared in different jobs
    """
    for skill, listing_count in counter_dict.items():
        print(skill, listing_count)
def skill_count(sites, counter_dict, driver_path):
    """Go listing by listing using the urls of the listings and count in how many listings the skills appear.

    A fresh driver is started for every listing and is always quit, even
    when loading or reading the page raises. Listings without a "vjs-desc"
    element are skipped for counting but the running totals are still printed.

    Arguments:
        sites {list} -- the urls of all the job listings in a list
        counter_dict {dictionary} -- the dictionary of skills and how often they appeared in different jobs
        driver_path {string address} -- path of Chrome driver
    Returns:
        counter_dict -- fully updated skills dictionary
        end_time -- time in seconds of when all the processes finished
    """
    for job_number, site in enumerate(sites, start=1):
        driver = set_driver(driver_path)
        try:
            driver.get(site)
            print("url of curr page", site)
            print("job number", job_number)
            #! find_elements(by, value) is used because the find_elements_by_*
            #! helpers were removed in newer Selenium
            descriptions = driver.find_elements("id", "vjs-desc")
            if descriptions:
                #! lower-case the text so it can be compared with lower-cased skills
                description = descriptions[0].text.lower().replace("\n", " ")
                # print(description) if you want to see what the descriptions says
                counter_dict = count_keywords(description, counter_dict)
            print_results(counter_dict)
        finally:
            #! always release the browser, otherwise a failed listing leaks a Chrome process
            driver.quit()
    end_time = time.time()
    return counter_dict, end_time
def end_gui(start_time, end_time, counter_dict, sites):
    """Report the findings once the program finishes.

    Prints the elapsed time to the console, then shows a dialog box listing
    every skill with its count, ordered from highest count to lowest.

    Arguments:
        start_time -- time in seconds of when the searching process started
        end_time -- time in seconds of when all the processes finished
        counter_dict {dictionary} -- the dictionary of skills and how often they appeared in different jobs
        sites {list} -- the urls of all the job listings in a list
    """
    elapsed = str(datetime.timedelta(seconds=(end_time - start_time)))
    print("Time program took to run: " + elapsed)
    #! sorted() is stable, so skills with equal counts keep their dictionary order
    ranked = sorted(counter_dict.items(),
                    key=lambda pair: pair[1], reverse=True)
    result = "".join(
        name.capitalize() + " (" + str(hits) + ")\n" for name, hits in ranked)
    easygui.msgbox("Our final list of skills across " + str(len(sites)) +
                   " jobs, sorted from most needed to least:\n" + result + "\nFinished in " + elapsed)
def bar_print(field, state, sites, counter_dict):
    """Draw the skill counts as a bar chart, save it as a .png, then show it.

    The file name is the number of listings followed by the field and the
    state. When counter_dict is empty nothing is drawn or saved, because
    there are no bars to plot.

    Arguments:
        field {string} -- the job field, with spaces already replaced by +
        state {string} -- the U.S. state that was searched
        sites {list} -- the urls of all the job listings in a list
        counter_dict {dictionary} -- the dictionary of skills and how often they appeared in different jobs
    """
    if not counter_dict:
        #! zip(*{}.items()) yields nothing, so plt.bar() would be called without arguments and fail
        print("No skills to plot")
        return
    skill_names = list(counter_dict)
    mention_counts = list(counter_dict.values())
    plt.bar(skill_names, mention_counts)
    plt.ylabel("Amount of job listing mentions")
    plt.xlabel("Skill")
    plt.suptitle("For {} {} jobs in {}, U.S.".format(
        str(len(sites)), field.replace("+", " "), state))
    #! save before show: the figure is written to disk first, then displayed
    plt.savefig(str(len(sites)) + field + state + ".png")
    plt.show()
def main():
    """Run the whole pipeline: ask the user, collect listings, count skills, report.
    """
    input("Press enter to start!")
    chrome_driver_path = set_driver_path()
    gui_answers = start_gui()
    #! the plain skills list is not needed here; counter_dict already carries the skills as keys
    field, state, pages_range, _skills, counter_dict, search_url_master = gui_answers
    start_time, sites = gather_job_listings(
        pages_range, search_url_master, chrome_driver_path)
    counter_dict, end_time = skill_count(
        sites, counter_dict, chrome_driver_path)
    end_gui(start_time, end_time, counter_dict, sites)
    bar_print(field, state, sites, counter_dict)


#! only start the scraper when this file is run as a script, not when it is imported
if __name__ == "__main__":
    main()
# ? Example result from 257 job listings of software engineering in CA
# job number 257
# Java 20
# Python 18
# Perl 0
# C++ 15
# C# 13
# Rust 2
# Ruby 3
# VB 0
# MATLAB 3
# PHP 0
# Scala 1
# HTML 5
# CSS 6