Web Scrapper for Job Search Platform not Scrapping Content

Web Scrapper for Job Search Platform not Scrapping Content - python

I am trying to scrape data from a job searching platform called Job Street. The web crawler works but the generated csv file is empty with no data. The expected output should be a list of jobs with the job title, description, etc.
Below is my code. I am performing this task using selenium. I would really appreciate the help. Thank you in advanced.
headers = {'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) (KHTML, like Gecko) Chrome/102.0.5005.61 '}
path = "/Users/Downloads/jobstreet-scraper-main/chromedriver"
driver = Chrome(executable_path=path)
time.sleep(2)
base_url = "https://www.jobstreet.com.my/en/job-search/{}-jobs/{}/"
def get_page_number(keyword):
#input: keyword for job_postings
#output: number of pages
url = base_url.format(keyword, 1)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
result_text = soup.find("span",{"class": "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1x _18qlyvc1 _1d0g9qk4 _18qlyvc8"})
results = result_text.text.split()
result = result_text.text.split()[-2]
result1 = result.replace(',','')
result2 = int(result1)
page_number = math.ceil(result2/30)
return page_number
def job_page_scraper(link):
url = "https://www.jobstreet.com.my"+link
print("scraping...", url)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
scripts = soup.find_all("script")
for script in scripts:
if script.contents:
txt = script.contents[0].strip()
if 'window.REDUX_STATE = ' in txt:
jsonStr = script.contents[0].strip()
jsonStr = jsonStr.split('window.REDUX_STATE = ')[1].strip()
jsonStr = jsonStr.split('}}}};')[0].strip()
jsonStr = jsonStr+"}}}}"
jsonObj = json.loads(jsonStr)
job = jsonObj['details']
job_id = job['id']
job_expired = job['isExpired']
job_confidential = job['isConfidential']
job_salary_min = job['header']['salary']['min']
job_salary_max = job['header']['salary']['max']
job_salary_currency = job['header']['salary']['currency']
job_title = job['header']['jobTitle']
company = job['header']['company']['name']
job_post_date = job['header']['postedDate']
job_internship = job['header']['isInternship']
company_website = job['companyDetail']['companyWebsite']
company_avgProcessTime = job['companyDetail']['companySnapshot']['avgProcessTime']
company_registrationNo = job['companyDetail']['companySnapshot']['registrationNo']
company_workingHours = job['companyDetail']['companySnapshot']['workingHours']
company_facebook = job['companyDetail']['companySnapshot']['facebook']
company_size = job['companyDetail']['companySnapshot']['size']
company_dressCode = job['companyDetail']['companySnapshot']['dressCode']
company_nearbyLocations = job['companyDetail']['companySnapshot']['nearbyLocations']
company_overview = job['companyDetail']['companyOverview']['html']
job_description = job['jobDetail']['jobDescription']['html']
job_summary = job['jobDetail']['summary']
job_requirement_career_level = job['jobDetail']['jobRequirement']['careerLevel']
job_requirement_yearsOfExperience = job['jobDetail']['jobRequirement']['yearsOfExperience']
job_requirement_qualification = job['jobDetail']['jobRequirement']['qualification']
job_requirement_fieldOfStudy = job['jobDetail']['jobRequirement']['fieldOfStudy']
#job_requirement_industry = job['jobDetail']['jobRequirement']['industryValue']['label']
job_requirement_skill = job['jobDetail']['jobRequirement']['skills']
job_employment_type = job['jobDetail']['jobRequirement']['employmentType']
job_languages = job['jobDetail']['jobRequirement']['languages']
job_benefits = job['jobDetail']['jobRequirement']['benefits']
job_apply_url = job['applyUrl']['url']
job_location_zipcode = job['location'][0]['locationId']
job_location = job['location'][0]['location']
job_country = job['sourceCountry']
return [job_id, job_title, job_expired, job_confidential, job_salary_max, job_salary_max, job_salary_currency, company, job_post_date, job_internship, company_website, company_avgProcessTime, company_registrationNo, company_workingHours, company_facebook, company_size, company_dressCode, company_nearbyLocations, company_overview, job_description, job_summary, job_requirement_career_level, job_requirement_fieldOfStudy, job_requirement_yearsOfExperience, job_requirement_qualification, job_requirement_skill, job_employment_type, job_languages, job_benefits, job_apply_url, job_location_zipcode, job_location, job_country]
def page_crawler(keyword):
# input: keyword for job postings
# output: dataframe of links scraped from each page
# page number
page_number = get_page_number(keyword)
job_links = []
for n in range(page_number):
print('Loading page {} ...'.format(n+1))
url = base_url.format(keyword, n+1)
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
#extract all job links
links = soup.find_all('a',{'class':'sx2jih0'})
job_links += links
jobs = []
for link in job_links:
job_link = link['href'].strip().split('?', 1)[0]
jobs.append([keyword, job_link] + job_page_scraper(job_link))
result_df = pd.DataFrame(jobs, columns = ['keyword', 'link', 'job_id', 'job_title', 'job_expired', 'job_confidential', 'job_salary_max', 'job_salary_max', 'job_salary_currency', 'company', 'job_post_date', 'job_internship', 'company_website', 'company_avgProcessTime', 'company_registrationNo', 'company_workingHours', 'company_facebook', 'company_size', 'company_dressCode', 'company_nearbyLocations', 'company_overview', 'job_description', 'job_summary', 'job_requirement_career_level', 'job_requirement_fieldOfStudy', 'job_requirement_yearsOfExperience', 'job_requirement_qualification', 'job_requirement_skill', 'job_employment_type', 'job_languages', 'job_benefits', 'job_apply_url', 'job_location_zipcode', 'job_location', 'job_country'])
return result_df
def main():
# a list of job roles to be crawled
key_words = ['medical doctor']
dfs = []
for key in key_words:
key_df = page_crawler(key)
dfs.append(key_df)
# save scraped information as csv
pd.concat(dfs).to_csv("job_postings_results.csv")
if __name__ == '__main__':
main()

Related

youtube scraper requests & urllib.parse

trying to get sub count for each video channel I get but don't know why it's not working
class YoutubeSearch:
def __init__(self, search_terms: str, max_results=None):
self.search_terms = search_terms
self.max_results = max_results
self.videos = self._search()
def _search(self):
encoded_search = urllib.parse.quote_plus(self.search_terms)
BASE_URL = "https://youtube.com"
url = f"{BASE_URL}/results?search_query={encoded_search}"
response = requests.get(url).text
while "ytInitialData" not in response:
response = requests.get(url).text
results = self._parse_html(response)
if self.max_results is not None and len(results) > self.max_results:
return results[: self.max_results]
return results
def _parse_html(self, response):
results = []
start = (
response.index("ytInitialData")
+ len("ytInitialData")
+ 3
)
end = response.index("};", start) + 1
json_str = response[start:end]
data = json.loads(json_str)
videos = data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"][
"sectionListRenderer"
]["contents"][0]["itemSectionRenderer"]["contents"]
# get subscriberCountText from each video
for video in videos:
res = {}
if "videoRenderer" in video.keys():
video_data = video.get("videoRenderer", {})
#res["id"] = video_data.get("videoId", None)
res["title"] = video_data.get("title", {}).get("runs", [[{}]])[0].get("text", None)
#res["channel"] = video_data.get("longBylineText", {}).get("runs", [[{}]])[0].get("text", None)
res["url_suffix"] = video_data.get("navigationEndpoint", {}).get("commandMetadata", {}).get("webCommandMetadata", {}).get("url", None)
res["subscribers"] = video_data.get("subscriberCountText", {}).get("simpleText", 0)
results.append(res)
return results
def to_json(self, clear_cache=True):
result = json.dumps({"videos": self.videos})
if clear_cache:
self.videos = ""
return result
Everthing works except sub count and I know it's semantic error
in
res["subscribers"] = video_data.get("subscriberCountText", {}).get("simpleText", 0)
for subscriberCountText i think i should get each channel data i get but how?

how to dump a json dict completely? (so far just dumpt last page )

I couldn't dump the json dict completly
I just could dump the json dict last page. Help me,please.
the coding showed below:
def job_list(url):
htmlFile = requests.get(url)
objSoup = bs4.BeautifulSoup(htmlFile.text,'lxml')
jobs = objSoup.find_all('article',class_='js-job-item')
job_list = []
for job in jobs:
cust_name = job.get('data-cust-name')
print("公司名稱:",cust_name)
job_name = job.get('data-job-name')
print("職稱名稱:",job_name)
d = [('公司名稱',cust_name),('職務名稱', job_name)]
j_dict = dict(d)
job_list.append(j_dict)
url_H = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=藥師&order=1&asc=0&page='
url_T = '&mode=s&jobsource=2021indexpoc'
page_total = 2
for page in range(page_total):
url = url_H+str(page+1)+url_T
job_list(url)
print('-'*70)
time.sleep(random.randint(3,5))
myjob = {'Job':job_list}
fn = '104爬蟲.json'
with open(fn, "w") as fnObj:
json.dump(myjob,fnObj,indent=2,ensure_ascii=False)

Try this code
jobs_to_dump = [] #### added
def job_list(url):
htmlFile = requests.get(url)
objSoup = bs4.BeautifulSoup(htmlFile.text, 'lxml')
jobs = objSoup.find_all('article', class_='js-job-item')
job_list = []
for job in jobs:
cust_name = job.get('data-cust-name')
print("公司名稱:", cust_name)
job_name = job.get('data-job-name')
print("職稱名稱:", job_name)
d = [('公司名稱', cust_name), ('職務名稱', job_name)]
j_dict = dict(d)
jobs_to_dump.append(j_dict) ###modified
url_H = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=藥師&order=1&asc=0&page='
url_T = '&mode=s&jobsource=2021indexpoc'
page_total = 2
for page in range(page_total):
url = url_H + str(page + 1) + url_T
job_list(url)
print('-' * 70)
time.sleep(random.randint(3, 5))
myjob = {'Job': jobs_to_dump} #### modified
fn = '104爬蟲.json'
with open(fn, "w") as fnObj:
json.dump(myjob, fnObj, indent=2, ensure_ascii=False)

BeautifulSoup find.all() web scraping returns empty

When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried, but haven't worked for my purposes: requests.Session() solutions as suggested in this answer, .json as suggested here.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from googletrans import Translator
translator = Translator()
rg = []
ctr_n = []
ctr = []
yr = []
mn = []
sub = []
cst_n = []
cst = []
mag = []
pty_n = []
pty = []
can = []
pev1 = []
vot1 = []
vv1 = []
ivv1 = []
to1 = []
cv1 = []
cvs1 = []
pv1 = []
pvs1 = []
pev2 = []
vot2 = []
vv2 = []
ivv2 = []
to2 = []
cv2 = []
cvs2 =[]
pv2 = []
pvs2 = []
seat = []
no_info = []
manual = []
START_PAGE = 1
END_PAGE = 42
for page in range(START_PAGE, END_PAGE + 1):
page = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))
page.encoding = page.apparent_encoding
if not page:
pass
else:
soup = BeautifulSoup(page.text, 'html.parser')
tbody = soup.find_all('table', class_='table table-borderd table-striped table-hover dataTable no-footer clickable right2 right4')
sleep(randint(2,10))
for container in tbody:
col1 = container.find_all('tr', {'data-id':'26079'})
for info in col1:
col_1 = info.find_all('td')
for data in col_1:
party = data[0]
party_trans = translator.translate(party)
pty_n.append(party_trans)
pvotes = data[1]
pv1.append(pvotes)
pshare = data[2]
pvs1.append(pshare)
mandates = data[3]
seat.append(mandates)
col2 = container.find_all('tr', {'data-id':'26075'})
for info in col2:
col_2 = info.find_all('td')
for data in col_2:
party2 = data[0]
party_trans2 = translator.translate(party2)
pty_n.append(party_trans2)
pvotes2 = data[1]
pv1.append(pvotes2)
pshare2 = data[2]
pvs1.append(pshare2)
mandates2 = data[3]
seat.append(mandates2)
col3 = container.find_all('tr', {'data-id':'26063'})
for info in col3:
col_3 = info.find_all('td')
for data in col_3:
party3 = data[0].text
party_trans3 = translator.translate(party3)
pty_n.extend(party_trans3)
pvotes3 = data[1].text
pv1.extend(pvotes3)
pshare3 = data[2].text
pvs1.extend(pshare3)
mandates3 = data[3].text
seat.extend(mandates3)
col4 = container.find_all('tr', {'data-id':'26091'})
for info in col4:
col_4 = info.find_all('td',recursive=True)
for data in col_4:
party4 = data[0]
party_trans4 = translator.translate(party4)
pty_n.extend(party_trans4)
pvotes4 = data[1]
pv1.extend(pvotes4)
pshare4 = data[2]
pvs1.extend(pshare4)
mandates4 = data[3]
seat.extend(mandates4)
col5 = container.find_all('tr', {'data-id':'26073'})
for info in col5:
col_5 = info.find_all('td')
for data in col_5:
party5 = data[0]
party_trans5 = translator.translate(party5)
pty_n.extend(party_trans5)
pvotes5 = data[1]
pv1.extend(pvotes5)
pshare5 = data[2]
pvs1.extend(pshare5)
mandates5 = data[3]
seat.extend(mandates5)
col6 = container.find_all('tr', {'data-id':'26080'})
for info in col6:
col_6 = info.find_all('td')
for data in col_6:
party6 = data[0]
party_trans6 = translator.translate(party6)
pty_n.extend(party_trans6)
pvotes6 = data[1]
pv1.extend(pvotes6)
pshare6 = data[2]
pvs1.extend(pshare6)
mandates6 = data[3]
seat.extend(mandates6)
#### TOTAL VOTES ####
tfoot = soup.find_all('tfoot')
for data in tfoot:
fvote = data.find_all('td')
for info in fvote:
votefinal = info.find(text=True).get_text()
fvoteindiv = [votefinal]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
vot1.extend(fvotelist)
#### CONSTITUENCY NAMES ####
constit = soup.find_all('a', class_='btn btn-link last')
for data in constit:
names = data.get_text()
names_clean = names.replace("Sejum Constituency no.","")
names_clean2 = names_clean.replace("[","")
names_clean3 = names_clean2.replace("]","")
namesfinal = names_clean3.split()[1]
constitindiv = [namesfinal]
constitlist = constitindiv * (len(pty_n) - len(cst_n))
cst_n.extend(constitlist)
#### UNSCRAPABLE INFO ####
region = 'Europe'
reg2 = [region]
reglist = reg2 * (len(pty_n) - len(rg))
rg.extend(reglist)
country = 'Poland'
ctr2 = [country]
ctrlist = ctr2 * (len(pty_n) - len(ctr_n))
ctr_n.extend(ctrlist)
year = '2019'
yr2 = [year]
yrlist = yr2 * (len(pty_n) - len(yr))
yr.extend(yrlist)
month = '10'
mo2 = [month]
molist = mo2 * (len(pty_n) - len(mn))
mn.extend(molist)
codes = ''
codes2 = [codes]
codeslist = codes2 * (len(pty_n) - len(manual))
manual.extend(codeslist)
noinfo = '-990'
noinfo2 = [noinfo]
noinfolist = noinfo2 * (len(pty_n) - len(no_info))
no_info.extend(noinfolist)
print(len(rg), len(pty_n), len(pv1), len(pvs1), len(no_info), len(vot1), len(cst_n))
poland19 = pd.DataFrame({
'rg' : rg,
'ctr_n' : ctr_n,
'ctr': manual,
'yr' : yr,
'mn' : mn,
'sub' : manual,
'cst_n': cst_n,
'cst' : manual,
'mag': manual,
'pty_n': pty_n,
'pty': manual,
'can': can,
'pev1': no_info,
'vot1': vot1,
'vv1': vot1,
'ivv1': no_info,
'to1': no_info,
'cv1': no_info,
'cvs1': no_info,
'pv1': cv1,
'pvs1': cvs1,
'pev2': no_info,
'vot2': no_info,
'vv2': no_info,
'ivv2': no_info,
'to2': no_info,
'cv2': no_info,
'cvs2': no_info,
'pv2' : no_info,
'pvs2' : no_info,
'seat' : manual
})
print(poland19)
poland19.to_csv('poland_19.csv')

As commented you probably need to use Selenium. You could replace the requests lib and replace the request statements with sth like this:
from selenium import webdriver
wd = webdriver.Chrome('pathToChromeDriver') # or any other Browser driver
wd.get(url) # instead of requests.get()
soup = BeautifulSoup(wd.page_source, 'html.parser')
You need to follow the instructions to install and implement the selenium lib at this link: https://selenium-python.readthedocs.io/
Note: I tested your code with selenium and I was able to get the table that you were looking for, but with the class_=... does not work for some reason.
Instead browsing at the scraped data I found that it has an attribute id. So maybe try also this instead:
tbody = soup.find_all('table', id="DataTables_Table_0")
And again, by doing the get requests with the selenium lib.
Hope that was helpful :)
Cheers

Python requests.get() loop returns nothing

When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried, but haven't worked for my purposes: requests.Session() solutions as suggested in this answer, .json as suggested here.
for page in range(100, 350):
page = requests.get("https://www.ghanaweb.com/GhanaHomePage/election2012/parliament.constituency.php?ID=" + str(page) + "&res=pm")
page.encoding = page.apparent_encoding
if not page:
pass
else:
soup = BeautifulSoup(page.text, 'html.parser')
ghana_tbody = soup.find_all('tbody')
sleep(randint(2,10))
for container in ghana_tbody:
#### CANDIDATES ####
candidate = container.find_all('div', class_='can par')
for data in candidate:
cand = data.find('h4')
for info in cand:
if cand is not None:
can2 = info.get_text()
can.append(can2)
#### PARTY NAMES ####
partyn = container.find_all('h5')
for data in partyn:
if partyn is not None:
partyn2 = data.get_text()
pty_n.append(partyn2)
#### CANDIDATE VOTES ####
votec = container.find_all('td', class_='votes')
for data in votec:
if votec is not None:
votec2 = data.get_text()
cv1.append(votec2)
#### CANDIDATE VOTE SHARE ####
cansh = container.find_all('td', class_='percent')
for data in cansh:
if cansh is not None:
cansh2 = data.get_text()
cvs1.append(cansh2)
#### TOTAL VOTES ####`
tfoot = soup.find_all('tr', class_='total')
for footer in tfoot:
fvote = footer.find_all('td', class_='votes')
for data in fvote:
if fvote is not None:
fvote2 = data.get_text()
fvoteindiv = [fvote2]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
vot1.extend(fvotelist)
Thanks in advance for your help!

I've made some simplification changes. The major changes that needed to be changed were:
ghana_tbody = soup.find_all('table', class_='canResults')
can2 = info # not info.get_text()
I have only tested this against page 112; life is too short.
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep
can = []
pty_n = []
cv1 = []
cvs1 = []
vot1 = []
START_PAGE = 112
END_PAGE = 112
for page in range(START_PAGE, END_PAGE + 1):
page = requests.get("https://www.ghanaweb.com/GhanaHomePage/election2012/parliament.constituency.php?ID=112&res=pm")
page.encoding = page.apparent_encoding
if not page:
pass
else:
soup = BeautifulSoup(page.text, 'html.parser')
ghana_tbody = soup.find_all('table', class_='canResults')
sleep(randint(2,10))
for container in ghana_tbody:
#### CANDIDATES ####
candidate = container.find_all('div', class_='can par')
for data in candidate:
cand = data.find('h4')
for info in cand:
can2 = info # not info.get_text()
can.append(can2)
#### PARTY NAMES ####
partyn = container.find_all('h5')
for data in partyn:
partyn2 = data.get_text()
pty_n.append(partyn2)
#### CANDIDATE VOTES ####
votec = container.find_all('td', class_='votes')
for data in votec:
votec2 = data.get_text()
cv1.append(votec2)
#### CANDIDATE VOTE SHARE ####
cansh = container.find_all('td', class_='percent')
for data in cansh:
cansh2 = data.get_text()
cvs1.append(cansh2)
#### TOTAL VOTES ####`
tfoot = soup.find_all('tr', class_='total')
for footer in tfoot:
fvote = footer.find_all('td', class_='votes')
for data in fvote:
fvote2 = data.get_text()
fvoteindiv = [fvote2]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
vot1.extend(fvotelist)
print('can = ', can)
print('pty_n = ', pty_n)
print('cv1 = ', cv1)
print('cvs1 = ', cvs1)
print('vot1 = ', vot1)
Prints:
can = ['Kwadwo Baah Agyemang', 'Daniel Osei', 'Anyang - Kusi Samuel', 'Mary Awusi']
pty_n = ['NPP', 'NDC', 'IND', 'IND']
cv1 = ['14,966', '9,709', '8,648', '969', '34292']
cvs1 = ['43.64', '28.31', '25.22', '2.83', '\xa0']
vot1 = ['34292', '34292', '34292', '34292']
Be sure to first change START_PAGE and END_PAGE to 100 and 350 respecively.

How to fixing scrapy json row to multiple json file

I have created a scrapy crawler to export individual item to a folder called out but I got 58 items from crawler but not getting 58 files. We just found 50 files.
Currently, I am using windows 10 and python 3
# -*- coding: utf-8 -*-
import json
import os
import random
from scrapy import Spider
from scrapy.http import Request
class AndroiddeviceSpider(Spider):
name = 'androiddevice'
allowed_domains = ['androiddevice.info']
start_urls = ['']
def __init__(self,sr_term):
self.start_urls=['https://www.androiddevice.info/devices?search='+sr_term]
def parse(self, response):
print (response.url)
print ('\n')
listings = response.css('a:nth-of-type(2)::attr(href)').extract()
for link in listings:
ac_link = response.urljoin(link)
sum_meta = link.split('/')[-1]
yield Request(ac_link, meta={"sum_meta":sum_meta}, callback=self.parse_p)
# yield scrapy.Request(ac_link, callback=self.parse_p)
# checking_last = response.xpath('//*[contains(text(),"Last")]').xpath('.//#href').extract_first()
# if checking_last:
# checking_last = checking_last.split('?page=')[-1].split('&')[0]
# ran_ = int(checking_last)+1
# if int(checking_last) is not 1:
# for i in range(2, ran_):
# next_p = 'https://www.androiddevice.info/devices?page={}&search=samsung'.format(i)
# n_link = next_p
# yield Request(n_link, callback=self.parse)
def parse_p(self, response):
sum_meta = response.meta['sum_meta']
r = response.url
r = r.split('/')[-2]
sum_meta = r
listings = response.css('th a::attr(href)').extract()
for link in listings:
ac_link = response.urljoin(link)
yield Request(ac_link, callback=self.parse_details)
checking_last = response.xpath('//*[contains(text(),"Last")]').xpath('.//#href').extract_first()
if checking_last:
checking_last = checking_last.split('?page=')[-1].split('&')[0]
ran_ = int(checking_last)+1
if int(checking_last) is not 1:
for i in range(2, ran_):
# next_p = 'https://www.androiddevice.info/devices?page={}&search=samsung'.format(i)
next_p = 'https://www.androiddevice.info/submissions/{}'+'?page={}'.format(sum_meta,i)
n_link = next_p
yield Request(n_link, callback=self.parse_p)
def parse_details(self, response):
url = response.url
print (url)
print ('\n')
item = {}
items = item
timezone_olson_random = [
"America/Indiana/Knox",
"America/Denver",
"America/Kentucky/Monticello",
"America/Detroit",
"America/Indiana/Petersburg",
"America/New_York",
"America/Chicago",
"America/Kentucky/Louisville",
"America/Los_Angeles",
"America/Indianapolis",
]
java_vm_version = response.xpath('//tr//th[contains(text(),"java_vm_version")]//following-sibling::th//pre//text()').extract_first()
ro_product_provider = response.xpath('//tr//th[contains(text(),"ro.product.manufacturer")]//following-sibling::th//pre//text()').extract_first()
ro_product_brand = response.xpath('//tr//th[contains(text(),"ro.product.manufacturer")]//following-sibling::th//pre//text()').extract_first()
ro_product_name = response.xpath('//tr//th[contains(text(),"ro.product.name")]//following-sibling::th//pre//text()').extract_first()
ro_product_model = response.xpath('//tr//th[contains(text(),"ro.product.model")]//following-sibling::th//pre//text()').extract_first()
ro_product_board = response.xpath('//tr//th[contains(text(),"ro.product.board")]//following-sibling::th//pre//text()').extract_first()
ro_build_id = response.xpath('//tr//th[contains(text(),"ro_build_id")]//following-sibling::th//pre//text()').extract_first()
ro_build_version_incremental = response.xpath('//tr//th[contains(text(),"ro_build_version_incremental")]//following-sibling::th//pre//text()').extract_first()
ro_build_version_release = response.xpath('//tr//th[contains(text(),"ro_build_version_release")]//following-sibling::th//pre//text()').extract_first()
ro_build_version_sdk = response.xpath('//tr//th[contains(text(),"ro_build_version_sdk")]//following-sibling::th//pre//text()').extract_first()
timezone_olson = random.choice(timezone_olson_random)
item['java_vm_version'] = java_vm_version
item['ro_product_provider'] = ro_product_provider
item['ro_product_brand'] = ro_product_brand
item['ro_product_name'] = ro_product_name
item['ro_product_model'] = ro_product_model
item['ro_product_board'] = ro_product_board
item['ro_build_id'] = ro_build_id
item['ro_build_version_incremental'] = ro_build_version_incremental
item['ro_build_version_release'] = ro_build_version_release
item['ro_build_version_sdk'] = ro_build_version_sdk
item['timezone_olson'] = timezone_olson
formatted_json = json.dumps(items, indent = 4,sort_keys=True)
with open(os.path.join('out', ro_product_model+".json"), "w") as f:
f.write(formatted_json)
yield item
I expect the output files would be 58 items to 58 .json files into my out folder
Thank you,
Palash

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Web Scrapper for Job Search Platform not Scrapping Content - python

Related

youtube scraper requests & urllib.parse

how to dump a json dict completely? (so far just dumpt last page )

BeautifulSoup find.all() web scraping returns empty

Python requests.get() loop returns nothing

How to fixing scrapy json row to multiple json file

Categories

Resources