I'm trying to scrape LinkedIn for job listings. Unfortunately, after each run I'm getting the same line repeated instead of all the listings. Would anyone know why this might be? I'm fairly new to web scrapers. I'm not sure if it's my loop that's causing the same result to repeat or if I'm exporting to CSV incorrectly.
```
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import csv

job_name = "Data Analyst"
country_name = "United States"

job_url = ""
for item in job_name.split(" "):
    if item != job_name.split(" ")[-1]:
        job_url = job_url + item + "%20"
    else:
        job_url = job_url + item

country_url = ""
for item in country_name.split(" "):
    if item != country_name.split(" ")[-1]:
        country_url = country_url + item + "%20"
    else:
        country_url = country_url + item

url = "https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=United%20States&geoId=103644278&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
url.format(job_url, country_url)

# Creating a webdriver instance
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Change the path to match the location of your "chromedriver" file
# driver = webdriver.Chrome("/home/im-admin/.scripts/Py/chromedriver")

# Opening the url we have just defined in our browser
driver.get(url)

# We find how many jobs are offered.
jobs_num = driver.find_element(By.CSS_SELECTOR, "h1>span").get_attribute("innerText")
if len(jobs_num.split(',')) > 1:
    jobs_num = int(jobs_num.split(',')[0]) * 2
else:
    jobs_num = int(jobs_num)
jobs_num = int(jobs_num)

# We create a while loop to browse all jobs.
i = 2
while i <= int(jobs_num / 2) + 1:
    # We keep scrolling down to the end of the view.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    i = i + 1
    print("Current at: ", i, "Percentage at: ", ((i + 1) / (int(jobs_num / 2) + 1)) * 100, "%", end="\r")
    try:
        # We try to click on the "load more results" button in case it is already displayed.
        infinite_scroller_button = driver.find_element(By.XPATH, ".//button[@aria-label='Load more results']")
        infinite_scroller_button.click()
        time.sleep(0.1)
    except:
        # If there is no button, there will be an error, so we keep scrolling down.
        time.sleep(0.1)
        pass

# We get a list containing all jobs that we have found.
job_lists = driver.find_element(By.CLASS_NAME, "jobs-search__results-list")
jobs = job_lists.find_elements(By.TAG_NAME, "li")  # returns a list

# We declare empty lists to keep track of all obtained data.
job_title_list = []
company_name_list = []
location_list = []
date_list = []
job_link_list = []

# We loop over every job and obtain all the wanted info.
for job in jobs:
    # job_title
    job_title = job.find_element(By.CSS_SELECTOR, "h3").get_attribute("innerText")
    job_title_list.append(job_title)
    # company_name
    company_name = job.find_element(By.CSS_SELECTOR, "h4").get_attribute("innerText")
    company_name_list.append(company_name)
    # location
    location = job.find_element(By.CSS_SELECTOR, "div>div>span").get_attribute("innerText")
    location_list.append(location)
    # date
    date = job.find_element(By.CSS_SELECTOR, "div>div>time").get_attribute("datetime")
    date_list.append(date)
    # job_link
    job_link = job.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
    job_link_list.append(job_link)

jd = []  # job_description
seniority = []
emp_type = []
job_func = []
job_ind = []

for item in range(len(jobs)):
    print(item)
    job_func0 = []
    industries0 = []
    # clicking job to view job details
    # ______________________________________________ JOB Link
    try:
        job_click_path = f'/html/body/div/div/main/section/ul/li[{item+1}]'
        job_click = job.find_element(By.XPATH, job_click_path).click()
    except:
        pass
    # job_click = job.find_element(By.XPATH, './/a[@class="base-card_full-link"]')
    # ______________________________________________ JOB Description
    jd_path = '/html/body/div/div/section/div/div/section/div/div/section/div'
    try:
        jd0 = job.find_element(By.XPATH, jd_path).get_attribute('innerText')
        jd.append(jd0)
    except:
        jd.append(None)
        pass
    # ______________________________________________ JOB Seniority
    seniority_path = '/html/body/div/div/section/div/div/section/div/ul/li[1]/span'
    try:
        seniority0 = job.find_element(By.XPATH, seniority_path).get_attribute('innerText')
        seniority.append(seniority0)
    except:
        seniority.append(None)
        pass
    # ______________________________________________ JOB Time
    emp_type_path = '/html/body/div/div/section/div/div/section/div/ul/li[2]/span'
    try:
        emp_type0 = job.find_element(By.XPATH, emp_type_path).get_attribute('innerText')
        emp_type.append(emp_type0)
    except:
        emp_type.append(None)
        pass
    # ______________________________________________ JOB Function
    function_path = '/html/body/div/div/section/div/div/section/div/ul/li[3]/span'
    try:
        func0 = job.find_element(By.XPATH, function_path).get_attribute('innerText')
        job_func.append(func0)
    except:
        job_func.append(None)
        pass
    # ______________________________________________ JOB Industry
    industry_path = '/html/body/div/div/section/div/div/section/div/ul/li[4]/span'
    try:
        ind0 = job.find_element(By.XPATH, industry_path).get_attribute('innerText')
        job_ind.append(ind0)
    except:
        job_ind.append(None)
        pass
    print("Current at: ", item, "Percentage at: ", (item + 1) / len(jobs) * 100, "%")

job_data = pd.DataFrame({
    'Date': date,
    'Company': company_name,
    'Title': job_title,
    'Location': location,
    'Description': jd,
    'Level': seniority,
    'Type': emp_type,
    'Function': job_func,
    'Industry': job_ind,
    'Link': job_link
})

# Change the path to jobdata.csv if you want it to output to a different folder.
# See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#
job_data.to_csv('jobdata.csv', encoding='utf-8', index=False)
```
This is my output
Date,Company,Title,Location,Description,Level,Type,Function,Industry,Link
2022-10-14,LHH,Data Analyst,"McLean, VA",,,,,,https://www.linkedin.com/jobs/view/data-analyst-at-lhh-3311865718?refId=pAkR2FDOYi8W2HOa%2FLgpiw%3D%3D&trackingId=5%2FX7p1W7L0eCE4XtpbzcEQ%3D%3D&position=23&pageNum=2&trk=public_jobs_jserp-result_search-card
2022-10-14,LHH,Data Analyst,"McLean, VA",,,,,,https://www.linkedin.com/jobs/view/data-analyst-at-lhh-3311865718?refId=pAkR2FDOYi8W2HOa%2FLgpiw%3D%3D&trackingId=5%2FX7p1W7L0eCE4XtpbzcEQ%3D%3D&position=23&pageNum=2&trk=public_jobs_jserp-result_search-card
2022-10-14,LHH,Data Analyst,"McLean, VA",,,,,,https://www.linkedin.com/jobs/view/data-analyst-at-lhh-3311865718?refId=pAkR2FDOYi8W2HOa%2FLgpiw%3D%3D&trackingId=5%2FX7p1W7L0eCE4XtpbzcEQ%3D%3D&position=23&pageNum=2&trk=public_jobs_jserp-result_search-card
I've tried printing the pandas DataFrame directly, with no success.
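One thing worth checking: the final DataFrame is built from `date`, `company_name`, `job_title`, `location` and `job_link`, which are single values left over from the last iteration of the first loop, so pandas broadcasts them across every row, while the per-job values actually live in the `*_list` variables. A minimal sketch of building the frame from the collected lists instead (same column names, assuming all the lists end up the same length):

```python
# Sketch: use the per-job lists gathered in the first loop,
# not the loop-local scalars from its final iteration.
job_data = pd.DataFrame({
    'Date': date_list,
    'Company': company_name_list,
    'Title': job_title_list,
    'Location': location_list,
    'Description': jd,
    'Level': seniority,
    'Type': emp_type,
    'Function': job_func,
    'Industry': job_ind,
    'Link': job_link_list
})
job_data.to_csv('jobdata.csv', encoding='utf-8', index=False)
```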
Related
As of right now I have working code for a web scraper that logs into the Indeed job search site. My issue is that I need to create a CSV file that shows every single job position that was found; currently it gives me the number of positions available and the description of only one of them. I hope I can get some help, I would greatly appreciate it.
import re
import csv
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
from datetime import datetime  # needed for datetime.today() in get_record()

jk_pattern = re.compile(r"jk:\'([a-zA-Z0-9]+)'")
params = {"q": "mechanical+engineer", "l": "united+states", "start": 0}
url = "https://www.indeed.com/jobs"

job_keys = set()
for x in range(10):
    response = requests.get(url, params=params)
    if not response.status_code == 200:
        break
    else:
        keys = jk_pattern.findall(response.text)
        if len(keys) > 0:
            for key in keys:
                job_keys.add(key)
    params['start'] += 20
    sleep(randint(0, 3))

len(job_keys)

template = "https://www.indeed.com/viewjob?jk={}"
jk = job_keys.pop()
job_url = template.format(jk)
response = requests.get(job_url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.find("div", id="jobDescriptionText").text)

def get_record(card):
    """Extract job data from a single record"""
    job_title = card.h2.a.get('title')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = card.find('span', 'date').text
    today = datetime.today().strftime('%Y-%m-%d')
    summary = card.find('div', 'summary').text.strip().replace('\n', ' ')
    job_url = 'https://www.indeed.com' + card.h2.a.get('href')
    # this does not exist for all jobs, so handle the exceptions
    salary_tag = card.find('span', 'salaryText')
    if salary_tag:
        salary = salary_tag.text.strip()
    else:
        salary = ''
    record = (job_title, company, job_location, post_date, today, summary, salary, job_url)
    return record

def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)  # get_url() builds the search URL (not shown in this snippet)
    # extract the job data
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
    # save the job data
    with open('results.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)
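To get every found position into the CSV rather than printing a single description, one option is to loop over all of `job_keys` instead of popping just one. A minimal sketch under that assumption, reusing `template`, `requests`, `BeautifulSoup` and the `jobDescriptionText` div from the snippet above (the `indeed_jobs.csv` filename is just an example):

```python
with open('indeed_jobs.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['JobKey', 'JobUrl', 'Description'])
    for jk in job_keys:  # every key collected above, not just one
        job_url = template.format(jk)
        response = requests.get(job_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        desc_tag = soup.find('div', id='jobDescriptionText')
        description = desc_tag.text.strip() if desc_tag else ''
        writer.writerow([jk, job_url, description])
        sleep(randint(0, 3))  # throttle requests a little
```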
Please, can anyone help me? I'm trying to navigate through the pages, but my code is not working and I don't know why. I get the product details for the first page, but I want to scrape the details for all the pages on the website. Below is my code, please check it for reference. Thanks in advance.
Below is the link to the website:
https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60
import xlwt
from selenium import webdriver
import re
import time
from datetime import date

class kotsovolosmobiles:
    def __init__(self):
        self.url = 'https://www.kotsovolos.gr/mobile-phones-gps/mobile-phones/smartphones?pageSize=60'
        self.country = 'GR'
        self.currency = 'euro'
        self.VAT = 'Included'
        self.shipping = 'Available for shipment'
        self.Pre_PromotionPrice = 'N/A'

    def kotsovolos(self):
        wb = xlwt.Workbook()
        ws = wb.add_sheet('Sheet1', cell_overwrite_ok=True)
        ws.write(0, 0, "Product_Url")
        ws.write(0, 0, "Product_Manufacturer")
        ws.write(0, 1, "Product_Url")
        ws.write(0, 2, "Product_Price")
        ws.write(0, 3, "Product_Model")
        ws.write(0, 4, "Memory")
        ws.write(0, 5, "Currency")
        ws.write(0, 6, "Color")
        ws.write(0, 7, "VAT")
        ws.write(0, 8, "Shipping Cost")
        ws.write(0, 9, "Pre-PromotionPrice")
        ws.write(0, 10, "Country")
        ws.write(0, 11, "Date")
        ws.write(0, 12, "Raw_Model")
        wb.save(r"C:\Users\Karthick R\Desktop\VS code\kotsovolos.xls")
        driver = webdriver.Chrome()
        driver.get(self.url)
        today = date.today()
        time.sleep(5)
        cookies = driver.find_element_by_css_selector('a[id="CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"]')
        cookies.click()
        print("cookies accepted")
        driver.maximize_window()
        time.sleep(5)
        titles = []
        models = []
        memorys = []
        prod_prices = []
        p_links = []
        p_colors = []
        while True:
            storage_box = []
            storage_box = driver.find_elements_by_css_selector('div[class="product"]')
            for storage_boxes in storage_box:
                product_url = storage_boxes.find_element_by_css_selector('div[class="title"] a').get_attribute('href')
                print(product_url)
                p_links.append(product_url)
                p_model = storage_boxes.find_element_by_css_selector('div[class="title"] a').text
                print(p_model)
                models.append(p_model)
                manufacturer1 = p_model.split(" ")
                print(manufacturer1[0])
                titles.append(manufacturer1[0])
                memory = []
                memory = re.findall(r'\d+ ?[gG][bB]', p_model)
                print(memory)
                memory1 = str(memory).replace("['", '').replace("']", '').replace("[]", '').strip()
                if "," in memory1:
                    arr = memory1.split(",")
                    for str1 in arr:
                        str2 = str1.replace("GB", "").replace("gb", "").replace("'", "").strip()
                        if len(str2) != 1:
                            memory_str = str1
                            break
                elif memory1 == "":
                    memory_str = 'N/A'
                else:
                    memory_str = memory1
                memory_str = memory_str.replace("'", "").strip()
                print(memory_str)
                memorys.append(memory_str)
                colors = []
                prod_color = p_model.split(" ")
                length = len(prod_color)
                indexcolor = length - 3
                colors.append(prod_color[indexcolor])
                color1 = str(colors).replace("['", '').replace("']", '').strip()
                print(color1)
                p_colors.append(color1)
                p_price = storage_boxes.find_element_by_css_selector('.priceWithVat > .price').text
                print(p_price)
                prod_prices.append(p_price)
            next = driver.find_element_by_css_selector('.pagination_next a')
            time.sleep(3)
            next.click()
            print("next page")
            time.sleep(3)

kotsovolos_gr = kotsovolosmobiles()
kotsovolos_gr.kotsovolos()
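A loop like the `while True` above never checks whether a next page actually exists, so it either stops early or raises once `.pagination_next a` can't be found or clicked. A minimal sketch of a guarded pagination step, assuming the same selector and the old `find_element_by_css_selector` API used above (the selector itself is taken from the question, not verified against the site):

```python
from selenium.common.exceptions import NoSuchElementException

while True:
    # ... scrape the product boxes on the current page, as in the loop above ...
    try:
        next_link = driver.find_element_by_css_selector('.pagination_next a')
    except NoSuchElementException:
        print("no next-page link found, stopping")
        break
    driver.execute_script("arguments[0].click();", next_link)  # JS click sidesteps overlay/visibility issues
    print("next page")
    time.sleep(3)
```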
I am a student working on a scraping project and I am having trouble completing my script because it fills my computer's memory with all of the data it stores.
It currently stores all of the data until the very end, so my idea is to break the scrape into smaller chunks and write the data out periodically, rather than building one big list and writing it all out at the end.
To do this, I would need to stop my scroll method, scrape the loaded profiles, write out the data I have collected, and then repeat this process without duplicating my data. It would be appreciated if someone could show me how to do this. Thank you for your help :)
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException

Data = []

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")

count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)
    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"
    info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
    Data.extend(info)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

with open("Spredsheet.txt", "w") as output:
    output.write(','.join(Data))

driver.close()
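As a sketch of the buffering idea described in the question (write out periodically instead of holding everything in `Data`), one could append to the file every N profiles and clear the buffer; `FLUSH_EVERY` and `flush_rows` are hypothetical names, and the commented lines show where this would slot into the existing loop:

```python
import csv

FLUSH_EVERY = 50  # hypothetical batch size

def flush_rows(rows, path="Spredsheet.csv"):
    """Append the buffered profile rows to disk and clear the buffer."""
    if not rows:
        return
    with open(path, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)
    rows.clear()

buffer = []
# Inside the `for link in profile_count:` loop, replace Data.extend(info) with:
#     buffer.append([Name, IssuedBy, CertificationorDesignaationNumber,
#                    CertfiedorDesignatedSince, AccreditedBy, Expires])
#     if len(buffer) >= FLUSH_EVERY:
#         flush_rows(buffer)
# After the loop finishes, flush whatever is left:
#     flush_rows(buffer)
```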
Try the approach below using requests and BeautifulSoup. In the script I have used the search API URL fetched from the website itself.
On the first iteration it creates the URL for the first page and writes the headers and data into the .csv file.
On the second and later iterations it creates the URL again with two extra parameters, start_on_page and show_per_page, where start_on_page starts at 20 and is incremented by 20 on each iteration, and show_per_page defaults to 100 so that 100 records are extracted per iteration, and so on until all the data has been dumped into the .csv file.
The script dumps four things: number, name, location and profile URL.
On each iteration the data is appended to the .csv file, so your memory issue will be resolved by this approach.
Do not forget to set the file_path variable to the folder where you want the .csv file to be created before running the script.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv

def scrap_directory_data():
    list_of_credentials = []
    file_path = ''
    file_name = 'credential_list.csv'
    count = 0
    page_number = 0
    page_size = 100
    create_url = ''
    main_url = 'https://directory.bcsp.org/search_results.php?'
    first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
    number_of_records = 0
    csv_headers = ['#', 'Name', 'Location', 'Profile URL']

    while True:
        if count == 0:
            create_url = main_url + first_iteration_url
            print('-' * 100)
            print('1 iteration URL created: ' + create_url)
            print('-' * 100)
        else:
            create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
            print('-' * 100)
            print('Other then first iteration URL created: ' + create_url)
            print('-' * 100)

        page = requests.get(create_url, verify=False)
        extracted_text = bs(page.text, 'lxml')
        result = extracted_text.find_all('tr')
        if len(result) > 0:
            for idx, data in enumerate(result):
                if idx > 0:
                    number_of_records += 1
                    name = data.contents[1].text
                    location = data.contents[3].text
                    profile_url = data.contents[5].contents[0].attrs['href']
                    list_of_credentials.append({
                        '#': number_of_records,
                        'Name': name,
                        'Location': location,
                        'Profile URL': profile_url
                    })
                print(data)
                with open(file_path + file_name, 'a+') as cred_CSV:
                    csvwriter = csv.DictWriter(cred_CSV, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                    if idx == 0 and count == 0:
                        print('Writing CSV header now...')
                        csvwriter.writeheader()
                    else:
                        for item in list_of_credentials:
                            print('Writing data rows now..')
                            print(item)
                            csvwriter.writerow(item)
                        list_of_credentials = []
        count += 1
        page_number += 20

scrap_directory_data()
Currently, I'm trying to scrape the New York Times dining website (nytimes.com/reviews/dining) and get a list of links and neighborhoods for each restaurant. Unfortunately, I've been running the code block below for about 9 hours on Google Colab, and the iterator x is on its 1,175th run. I'm trying to figure out what's going on, but I'm too scared to halt the cell and start over again. Is it the nested for loop that is causing this process to take such a long time?
# Imports assumed from earlier in the notebook (added here for completeness);
# chrome_options is also assumed to be defined earlier (e.g. headless options for Colab).
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
driver.get("https://www.nytimes.com/reviews/dining")

WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, "//button[text()='Show More']"))).click()

url_list = []
nyt_dining = pd.DataFrame(columns=['Restaurant', 'URL', 'servesCuisine', 'priceRange', 'addressLocality'])
x = 0

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elements = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.css-102xbk1")))
    url_before = len(url_list)
    for e in elements:
        # 2.1 Getting the links
        link = e.find_element_by_css_selector("a.css-gg4vpm")
        link = link.get_attribute("href")
        # 2.2 Getting the restaurant name
        name = e.find_element_by_css_selector("h2.css-8aqwnr")
        name = name.text
        # 2.3 Getting other information
        info = e.find_element_by_css_selector("ul.css-o4kdzz")
        info = info.find_elements_by_tag_name('li')
        cuisine = ''
        price = ''
        neighborhood = ''
        for i in info:
            attribute = i.get_attribute("itemprop")
            if attribute == "servesCuisine":
                cuisine = i.text
            elif attribute == "priceRange":
                price = i.text
            elif attribute == "addressLocality":
                neighborhood = i.text
        # 2.4 Append to dataframe
        if link in url_list:
            continue
        else:
            url_list.append(link)
            nyt_dining = nyt_dining.append({'Restaurant': name, 'URL': link,
                                            'servesCuisine': cuisine,
                                            'priceRange': price,
                                            'addressLocality': neighborhood},
                                           ignore_index=True)
            print(x)
            x += 1
    url_after = len(url_list)
    if url_before >= url_after:
        break
    button = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, "//button[text()='Show More']")))
    driver.execute_script("arguments[0].click();", button)
    time.sleep(2)

nyt_dining
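On the runtime question: two things in this loop get slower with every "Show More" click — each pass re-reads every card loaded so far, and `nyt_dining.append(...)` copies the whole DataFrame on every call — so the per-iteration cost keeps growing. A minimal sketch of the usual mitigation, using a set for the seen-URL check and building the DataFrame once at the end (column names taken from the code above):

```python
seen_urls = set()  # O(1) membership test instead of scanning a list
rows = []          # plain dicts are cheap to append

# Inside the `for e in elements:` loop, replace the url_list / DataFrame.append logic with:
#     if link not in seen_urls:
#         seen_urls.add(link)
#         rows.append({'Restaurant': name, 'URL': link,
#                      'servesCuisine': cuisine,
#                      'priceRange': price,
#                      'addressLocality': neighborhood})

# After the while loop ends, build the frame once:
nyt_dining = pd.DataFrame(rows, columns=['Restaurant', 'URL', 'servesCuisine',
                                         'priceRange', 'addressLocality'])
```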
I have tried many times, but it does not work:
import requests
from lxml import html, etree
from selenium import webdriver
import time, json

# how many pages do you want to scan
page_numnotint = input("how many pages do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'

url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page=1&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'
driver = webdriver.Chrome()
driver.get(url)
base_html = driver.page_source
selctor = etree.HTML(base_html)
date_info = []
name_data, price_data = [], []
jd_goods_data = {}

for q in range(page_num):
    i = int(1)
    while True:
        name_string = '//*[@id="plist"]/ul/li[%d]/div/div[3]/a/em/text()' % (i)
        price_string = '//*[@id="plist"]/ul/li[%d]/div/div[2]/strong[1]/i/text()' % (i)
        if i == 60:
            break
        else:
            i += 1
        name = selctor.xpath(name_string)[0]
        name_data.append(name)
        price = selctor.xpath(price_string)[0]
        price_data.append(price)
        jd_goods_data[name] = price
        print(name_data)

    with open(file_name, 'w') as f:
        json.dump(jd_goods_data, f)
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click()
    time.sleep(2)

# for k, v in jd_goods_data.items():
#     print(k, v)
I am trying to download some details, but it doesn't work. If you type 2 pages to scan, it only downloads one page's details, but twice!
Ok, you define q but you do not actually use it as such. In this case, the convention is to name this unused variable _. I mean, instead of doing
for q in range(page_num):
you should do
for _ in range(page_num):
Thus, other programmers will directly know that you do not use q, and only want your operation to be repeated.
You also say that you get the same page twice, which means that (for some reason) the line driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click() does not execute correctly. There is surely a way to make it work, but in your case I see that your URL contains a parameter named page. I recommend you use that parameter instead, which then leads to actually using the variable q, as follows:
import requests
from lxml import html, etree
from selenium import webdriver
import time, json

# how many pages do you want to scan
page_numnotint = input("how many pages do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'

driver = webdriver.Chrome()
date_info = []
name_data, price_data = [], []
jd_goods_data = {}

for q in range(page_num):
    url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page={page}&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'.format(page=q)
    driver.get(url)
    base_html = driver.page_source
    selctor = etree.HTML(base_html)
    i = 1
    while True:
        name_string = '//*[@id="plist"]/ul/li[%d]/div/div[3]/a/em/text()' % (i)
        price_string = '//*[@id="plist"]/ul/li[%d]/div/div[2]/strong[1]/i/text()' % (i)
        if i == 60:
            break
        else:
            i += 1
        name = selctor.xpath(name_string)[0]
        name_data.append(name)
        price = selctor.xpath(price_string)[0]
        price_data.append(price)
        jd_goods_data[name] = price
        print(name_data)

with open(file_name, 'w') as f:
    json.dump(jd_goods_data, f)

driver.quit()