How to fix Scrapy writing each JSON item to its own JSON file - python

I have created a Scrapy crawler that exports each item as an individual file to a folder called out. The crawler yields 58 items, but only 50 files show up in the folder.
I am using Windows 10 and Python 3.
# -*- coding: utf-8 -*-
import json
import os
import random

from scrapy import Spider
from scrapy.http import Request


class AndroiddeviceSpider(Spider):
    name = 'androiddevice'
    allowed_domains = ['androiddevice.info']
    start_urls = ['']

    def __init__(self, sr_term):
        self.start_urls = ['https://www.androiddevice.info/devices?search=' + sr_term]

    def parse(self, response):
        print(response.url)
        print('\n')
        listings = response.css('a:nth-of-type(2)::attr(href)').extract()
        for link in listings:
            ac_link = response.urljoin(link)
            sum_meta = link.split('/')[-1]
            yield Request(ac_link, meta={"sum_meta": sum_meta}, callback=self.parse_p)
            # yield scrapy.Request(ac_link, callback=self.parse_p)
        # checking_last = response.xpath('//*[contains(text(),"Last")]').xpath('.//@href').extract_first()
        # if checking_last:
        #     checking_last = checking_last.split('?page=')[-1].split('&')[0]
        #     ran_ = int(checking_last) + 1
        #     if int(checking_last) is not 1:
        #         for i in range(2, ran_):
        #             next_p = 'https://www.androiddevice.info/devices?page={}&search=samsung'.format(i)
        #             n_link = next_p
        #             yield Request(n_link, callback=self.parse)

    def parse_p(self, response):
        sum_meta = response.meta['sum_meta']
        r = response.url
        r = r.split('/')[-2]
        sum_meta = r
        listings = response.css('th a::attr(href)').extract()
        for link in listings:
            ac_link = response.urljoin(link)
            yield Request(ac_link, callback=self.parse_details)
        checking_last = response.xpath('//*[contains(text(),"Last")]').xpath('.//@href').extract_first()
        if checking_last:
            checking_last = checking_last.split('?page=')[-1].split('&')[0]
            ran_ = int(checking_last) + 1
            if int(checking_last) is not 1:
                for i in range(2, ran_):
                    # next_p = 'https://www.androiddevice.info/devices?page={}&search=samsung'.format(i)
                    next_p = 'https://www.androiddevice.info/submissions/{}' + '?page={}'.format(sum_meta, i)
                    n_link = next_p
                    yield Request(n_link, callback=self.parse_p)

    def parse_details(self, response):
        url = response.url
        print(url)
        print('\n')
        item = {}
        items = item
        timezone_olson_random = [
            "America/Indiana/Knox",
            "America/Denver",
            "America/Kentucky/Monticello",
            "America/Detroit",
            "America/Indiana/Petersburg",
            "America/New_York",
            "America/Chicago",
            "America/Kentucky/Louisville",
            "America/Los_Angeles",
            "America/Indianapolis",
        ]
        java_vm_version = response.xpath('//tr//th[contains(text(),"java_vm_version")]//following-sibling::th//pre//text()').extract_first()
        ro_product_provider = response.xpath('//tr//th[contains(text(),"ro.product.manufacturer")]//following-sibling::th//pre//text()').extract_first()
        ro_product_brand = response.xpath('//tr//th[contains(text(),"ro.product.manufacturer")]//following-sibling::th//pre//text()').extract_first()
        ro_product_name = response.xpath('//tr//th[contains(text(),"ro.product.name")]//following-sibling::th//pre//text()').extract_first()
        ro_product_model = response.xpath('//tr//th[contains(text(),"ro.product.model")]//following-sibling::th//pre//text()').extract_first()
        ro_product_board = response.xpath('//tr//th[contains(text(),"ro.product.board")]//following-sibling::th//pre//text()').extract_first()
        ro_build_id = response.xpath('//tr//th[contains(text(),"ro_build_id")]//following-sibling::th//pre//text()').extract_first()
        ro_build_version_incremental = response.xpath('//tr//th[contains(text(),"ro_build_version_incremental")]//following-sibling::th//pre//text()').extract_first()
        ro_build_version_release = response.xpath('//tr//th[contains(text(),"ro_build_version_release")]//following-sibling::th//pre//text()').extract_first()
        ro_build_version_sdk = response.xpath('//tr//th[contains(text(),"ro_build_version_sdk")]//following-sibling::th//pre//text()').extract_first()
        timezone_olson = random.choice(timezone_olson_random)
        item['java_vm_version'] = java_vm_version
        item['ro_product_provider'] = ro_product_provider
        item['ro_product_brand'] = ro_product_brand
        item['ro_product_name'] = ro_product_name
        item['ro_product_model'] = ro_product_model
        item['ro_product_board'] = ro_product_board
        item['ro_build_id'] = ro_build_id
        item['ro_build_version_incremental'] = ro_build_version_incremental
        item['ro_build_version_release'] = ro_build_version_release
        item['ro_build_version_sdk'] = ro_build_version_sdk
        item['timezone_olson'] = timezone_olson
        formatted_json = json.dumps(items, indent=4, sort_keys=True)
        with open(os.path.join('out', ro_product_model + ".json"), "w") as f:
            f.write(formatted_json)
        yield item
I expected the 58 items to be written out as 58 .json files in my out folder.
Thank you,
Palash
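A likely cause, though the question does not confirm it, is that the output filename is built only from ro_product_model, so any two items that report the same model overwrite each other's file (and an item whose model field is missing would fail the string concatenation). A minimal sketch of a write step that avoids filename collisions; the unique_path helper is illustrative and not part of the original spider:

import json
import os


def unique_path(directory, base_name):
    # return a path in `directory` that does not clash with an existing file
    candidate = os.path.join(directory, base_name + ".json")
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(directory, "{}_{}.json".format(base_name, counter))
        counter += 1
    return candidate


# inside parse_details, replacing the direct ro_product_model + ".json" write:
# path = unique_path('out', ro_product_model or 'unknown_model')
# with open(path, "w") as f:
#     f.write(json.dumps(item, indent=4, sort_keys=True))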

Related

youtube scraper requests & urllib.parse

I am trying to get the subscriber count for each video's channel, but I don't know why it isn't working.
import json
import urllib.parse

import requests


class YoutubeSearch:
    def __init__(self, search_terms: str, max_results=None):
        self.search_terms = search_terms
        self.max_results = max_results
        self.videos = self._search()

    def _search(self):
        encoded_search = urllib.parse.quote_plus(self.search_terms)
        BASE_URL = "https://youtube.com"
        url = f"{BASE_URL}/results?search_query={encoded_search}"
        response = requests.get(url).text
        while "ytInitialData" not in response:
            response = requests.get(url).text
        results = self._parse_html(response)
        if self.max_results is not None and len(results) > self.max_results:
            return results[: self.max_results]
        return results

    def _parse_html(self, response):
        results = []
        start = (
            response.index("ytInitialData")
            + len("ytInitialData")
            + 3
        )
        end = response.index("};", start) + 1
        json_str = response[start:end]
        data = json.loads(json_str)
        videos = data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"][
            "sectionListRenderer"
        ]["contents"][0]["itemSectionRenderer"]["contents"]
        # get subscriberCountText from each video
        for video in videos:
            res = {}
            if "videoRenderer" in video.keys():
                video_data = video.get("videoRenderer", {})
                # res["id"] = video_data.get("videoId", None)
                res["title"] = video_data.get("title", {}).get("runs", [[{}]])[0].get("text", None)
                # res["channel"] = video_data.get("longBylineText", {}).get("runs", [[{}]])[0].get("text", None)
                res["url_suffix"] = video_data.get("navigationEndpoint", {}).get("commandMetadata", {}).get("webCommandMetadata", {}).get("url", None)
                res["subscribers"] = video_data.get("subscriberCountText", {}).get("simpleText", 0)
                results.append(res)
        return results

    def to_json(self, clear_cache=True):
        result = json.dumps({"videos": self.videos})
        if clear_cache:
            self.videos = ""
        return result
Everything works except the subscriber count, and I know it's a semantic error in
res["subscribers"] = video_data.get("subscriberCountText", {}).get("simpleText", 0)
I think I need to fetch each channel's own data to get subscriberCountText, but how?
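The videoRenderer objects in search results do not appear to carry a subscriberCountText key at all, which is why the .get() chain always falls back to 0; the subscriber count normally lives on the channel page itself. A hedged sketch of one way to follow each result's channel link and pull the count out of that page; the ownerText path and the regex are assumptions about YouTube's current (and frequently changing) page data:

import re

import requests


def get_subscriber_text(channel_url_suffix):
    # fetch the channel page and try to find a "subscriberCountText" entry
    # in its embedded data; returns None if the pattern is not present
    html = requests.get("https://youtube.com" + channel_url_suffix).text
    match = re.search(r'"subscriberCountText":.*?"simpleText":"([^"]+)"', html)
    return match.group(1) if match else None


# inside _parse_html, a possible replacement for the subscribers line:
# channel_suffix = (video_data.get("ownerText", {}).get("runs", [{}])[0]
#                   .get("navigationEndpoint", {}).get("commandMetadata", {})
#                   .get("webCommandMetadata", {}).get("url"))
# res["subscribers"] = get_subscriber_text(channel_suffix) if channel_suffix else None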

Web Scraper for Job Search Platform not Scraping Content

I am trying to scrape data from a job search platform called JobStreet. The web crawler runs, but the generated CSV file is empty. The expected output is a list of jobs with the job title, description, etc.
Below is my code. I am performing this task using Selenium. I would really appreciate the help. Thank you in advance.
import json
import math
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) (KHTML, like Gecko) Chrome/102.0.5005.61'}

path = "/Users/Downloads/jobstreet-scraper-main/chromedriver"
driver = Chrome(executable_path=path)
time.sleep(2)

base_url = "https://www.jobstreet.com.my/en/job-search/{}-jobs/{}/"


def get_page_number(keyword):
    # input: keyword for job_postings
    # output: number of pages
    url = base_url.format(keyword, 1)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    result_text = soup.find("span", {"class": "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1x _18qlyvc1 _1d0g9qk4 _18qlyvc8"})
    results = result_text.text.split()
    result = result_text.text.split()[-2]
    result1 = result.replace(',', '')
    result2 = int(result1)
    page_number = math.ceil(result2 / 30)
    return page_number


def job_page_scraper(link):
    url = "https://www.jobstreet.com.my" + link
    print("scraping...", url)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    scripts = soup.find_all("script")
    for script in scripts:
        if script.contents:
            txt = script.contents[0].strip()
            if 'window.REDUX_STATE = ' in txt:
                jsonStr = script.contents[0].strip()
                jsonStr = jsonStr.split('window.REDUX_STATE = ')[1].strip()
                jsonStr = jsonStr.split('}}}};')[0].strip()
                jsonStr = jsonStr + "}}}}"
                jsonObj = json.loads(jsonStr)
                job = jsonObj['details']
                job_id = job['id']
                job_expired = job['isExpired']
                job_confidential = job['isConfidential']
                job_salary_min = job['header']['salary']['min']
                job_salary_max = job['header']['salary']['max']
                job_salary_currency = job['header']['salary']['currency']
                job_title = job['header']['jobTitle']
                company = job['header']['company']['name']
                job_post_date = job['header']['postedDate']
                job_internship = job['header']['isInternship']
                company_website = job['companyDetail']['companyWebsite']
                company_avgProcessTime = job['companyDetail']['companySnapshot']['avgProcessTime']
                company_registrationNo = job['companyDetail']['companySnapshot']['registrationNo']
                company_workingHours = job['companyDetail']['companySnapshot']['workingHours']
                company_facebook = job['companyDetail']['companySnapshot']['facebook']
                company_size = job['companyDetail']['companySnapshot']['size']
                company_dressCode = job['companyDetail']['companySnapshot']['dressCode']
                company_nearbyLocations = job['companyDetail']['companySnapshot']['nearbyLocations']
                company_overview = job['companyDetail']['companyOverview']['html']
                job_description = job['jobDetail']['jobDescription']['html']
                job_summary = job['jobDetail']['summary']
                job_requirement_career_level = job['jobDetail']['jobRequirement']['careerLevel']
                job_requirement_yearsOfExperience = job['jobDetail']['jobRequirement']['yearsOfExperience']
                job_requirement_qualification = job['jobDetail']['jobRequirement']['qualification']
                job_requirement_fieldOfStudy = job['jobDetail']['jobRequirement']['fieldOfStudy']
                # job_requirement_industry = job['jobDetail']['jobRequirement']['industryValue']['label']
                job_requirement_skill = job['jobDetail']['jobRequirement']['skills']
                job_employment_type = job['jobDetail']['jobRequirement']['employmentType']
                job_languages = job['jobDetail']['jobRequirement']['languages']
                job_benefits = job['jobDetail']['jobRequirement']['benefits']
                job_apply_url = job['applyUrl']['url']
                job_location_zipcode = job['location'][0]['locationId']
                job_location = job['location'][0]['location']
                job_country = job['sourceCountry']
                return [job_id, job_title, job_expired, job_confidential, job_salary_max, job_salary_max, job_salary_currency, company, job_post_date, job_internship, company_website, company_avgProcessTime, company_registrationNo, company_workingHours, company_facebook, company_size, company_dressCode, company_nearbyLocations, company_overview, job_description, job_summary, job_requirement_career_level, job_requirement_fieldOfStudy, job_requirement_yearsOfExperience, job_requirement_qualification, job_requirement_skill, job_employment_type, job_languages, job_benefits, job_apply_url, job_location_zipcode, job_location, job_country]


def page_crawler(keyword):
    # input: keyword for job postings
    # output: dataframe of links scraped from each page
    page_number = get_page_number(keyword)
    job_links = []
    for n in range(page_number):
        print('Loading page {} ...'.format(n + 1))
        url = base_url.format(keyword, n + 1)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # extract all job links
        links = soup.find_all('a', {'class': 'sx2jih0'})
        job_links += links
    jobs = []
    for link in job_links:
        job_link = link['href'].strip().split('?', 1)[0]
        jobs.append([keyword, job_link] + job_page_scraper(job_link))
    result_df = pd.DataFrame(jobs, columns=['keyword', 'link', 'job_id', 'job_title', 'job_expired', 'job_confidential', 'job_salary_max', 'job_salary_max', 'job_salary_currency', 'company', 'job_post_date', 'job_internship', 'company_website', 'company_avgProcessTime', 'company_registrationNo', 'company_workingHours', 'company_facebook', 'company_size', 'company_dressCode', 'company_nearbyLocations', 'company_overview', 'job_description', 'job_summary', 'job_requirement_career_level', 'job_requirement_fieldOfStudy', 'job_requirement_yearsOfExperience', 'job_requirement_qualification', 'job_requirement_skill', 'job_employment_type', 'job_languages', 'job_benefits', 'job_apply_url', 'job_location_zipcode', 'job_location', 'job_country'])
    return result_df


def main():
    # a list of job roles to be crawled
    key_words = ['medical doctor']
    dfs = []
    for key in key_words:
        key_df = page_crawler(key)
        dfs.append(key_df)
    # save scraped information as csv
    pd.concat(dfs).to_csv("job_postings_results.csv")


if __name__ == '__main__':
    main()
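An empty CSV (headers only) usually means jobs stayed empty, which in this script happens when soup.find_all('a', {'class': 'sx2jih0'}) matches nothing: JobStreet's generated class names change often, so selecting by them is fragile. A hedged debugging sketch that selects job links by href pattern instead; the '/job/' substring is an assumption about the site's current URL layout and should be verified in the browser:

from bs4 import BeautifulSoup


def extract_job_links(page_source):
    # collect job detail links by URL pattern rather than an obfuscated class name
    soup = BeautifulSoup(page_source, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True) if '/job/' in a['href']]
    print('found {} candidate job links'.format(len(links)))  # quick per-page sanity check
    return links


# in page_crawler, the class-based lookup could be swapped for:
# job_links += extract_job_links(driver.page_source)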

how to dump a json dict completely? (so far it only dumps the last page)

I can't dump the whole JSON dict; only the last page ends up in the dumped file. Please help.
The code is shown below:
import json
import random
import time

import bs4
import requests


def job_list(url):
    htmlFile = requests.get(url)
    objSoup = bs4.BeautifulSoup(htmlFile.text, 'lxml')
    jobs = objSoup.find_all('article', class_='js-job-item')
    job_list = []
    for job in jobs:
        cust_name = job.get('data-cust-name')
        print("公司名稱:", cust_name)  # company name
        job_name = job.get('data-job-name')
        print("職稱名稱:", job_name)  # job title
        d = [('公司名稱', cust_name), ('職務名稱', job_name)]
        j_dict = dict(d)
        job_list.append(j_dict)


url_H = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=藥師&order=1&asc=0&page='
url_T = '&mode=s&jobsource=2021indexpoc'
page_total = 2
for page in range(page_total):
    url = url_H + str(page + 1) + url_T
    job_list(url)
    print('-' * 70)
    time.sleep(random.randint(3, 5))

myjob = {'Job': job_list}
fn = '104爬蟲.json'
with open(fn, "w") as fnObj:
    json.dump(myjob, fnObj, indent=2, ensure_ascii=False)
Try this code
jobs_to_dump = []  #### added


def job_list(url):
    htmlFile = requests.get(url)
    objSoup = bs4.BeautifulSoup(htmlFile.text, 'lxml')
    jobs = objSoup.find_all('article', class_='js-job-item')
    job_list = []
    for job in jobs:
        cust_name = job.get('data-cust-name')
        print("公司名稱:", cust_name)
        job_name = job.get('data-job-name')
        print("職稱名稱:", job_name)
        d = [('公司名稱', cust_name), ('職務名稱', job_name)]
        j_dict = dict(d)
        jobs_to_dump.append(j_dict)  ### modified


url_H = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=藥師&order=1&asc=0&page='
url_T = '&mode=s&jobsource=2021indexpoc'
page_total = 2
for page in range(page_total):
    url = url_H + str(page + 1) + url_T
    job_list(url)
    print('-' * 70)
    time.sleep(random.randint(3, 5))

myjob = {'Job': jobs_to_dump}  #### modified
fn = '104爬蟲.json'
with open(fn, "w") as fnObj:
    json.dump(myjob, fnObj, indent=2, ensure_ascii=False)
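The fix works because results now accumulate in a module-level list (jobs_to_dump) instead of a local list that was discarded on every call. An equivalent sketch without the global, where job_list returns its page's results and the caller collects them (the all_jobs and page_jobs names are illustrative):

def job_list(url):
    htmlFile = requests.get(url)
    objSoup = bs4.BeautifulSoup(htmlFile.text, 'lxml')
    page_jobs = []
    for job in objSoup.find_all('article', class_='js-job-item'):
        page_jobs.append({
            '公司名稱': job.get('data-cust-name'),  # company name
            '職務名稱': job.get('data-job-name'),   # job title
        })
    return page_jobs


all_jobs = []
for page in range(page_total):
    all_jobs += job_list(url_H + str(page + 1) + url_T)
    time.sleep(random.randint(3, 5))

with open('104爬蟲.json', 'w') as fnObj:
    json.dump({'Job': all_jobs}, fnObj, indent=2, ensure_ascii=False)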

Python Scrapy Spider Not Following Correct Link

I am trying to scrape the data off of this post, but I am having an issue scraping the comments. The pagination of the comments is controlled by the "page=1" parameter at the end of the URL. I noticed that if "page=0" is used it loads all the comments on one page, which is really convenient. However, my Scrapy script only ever scrapes the comments from the first page; even if I change the link to "page=2" it still only scrapes the first page's comments. I cannot figure out why this is happening.
import scrapy
from scrapy.crawler import CrawlerProcess


class IdeaSpider(scrapy.Spider):
    name = "IdeaSpider"

    def start_requests(self):
        yield scrapy.Request(
            url="https://www.games2gether.com/amplitude-studios/endless-space-2/ideas/1850-force-infinite-actions-to"
                "-the-bottom-of-the-queue?page=0", callback=self.parse_idea)

    # parses title, post, status, author, date
    def parse_idea(self, response):
        post_author = response.xpath('//span[@class = "username-content"]/text()')
        temp_list.append(post_author.extract_first())
        post_categories = response.xpath('//a[@class = "list-tags-item ng-star-inserted"]/text()')
        post_categories_ext = post_categories.extract()
        if len(post_categories_ext) > 1:
            post_categories_combined = ""
            for category in post_categories_ext:
                post_categories_combined = post_categories_combined + category + ", "
            temp_list.append(post_categories_combined)
        else:
            temp_list.append(post_categories_ext[0])
        post_date = response.xpath('//div[@class = "time-date"]/text()')
        temp_list.append(post_date.extract_first())
        post_title = response.xpath('//h1[@class = "title"]/text()')
        temp_list.append(post_title.extract()[0])
        post_body = response.xpath('//article[@class = "post-list-item clearfix ng-star-inserted"]//div[@class = '
                                   '"post-list-item-message-content post-content ng-star-inserted"]//text()')
        post_body_ext = post_body.extract()
        if len(post_body_ext) > 1:
            post_body_combined = ""
            for text in post_body_ext:
                post_body_combined = post_body_combined + " " + text
            temp_list.append(post_body_combined)
        else:
            temp_list.append(post_body_ext[0])
        post_status = response.xpath('//p[@class = "status-title"][1]/text()')
        if len(post_status.extract()) != 0:
            temp_list.append(post_status.extract()[0])
        else:
            temp_list.append("no status")
        dev_name = response.xpath('//div[@class = "ideas-details-status-comment user-role u-bdcolor-2 dev"]//p[@class '
                                  '= "username user-role-username"]/text()')
        temp_list.append(dev_name.extract_first())
        dev_comment = response.xpath('//div[@class = "message post-content ng-star-inserted"]/p/text()')
        temp_list.append(dev_comment.extract_first())
        c_author_index = 0
        c_body_index = 0
        c_author_path = response.xpath('//article[@class = "post-list-item clearfix two-columns '
                                       'ng-star-inserted"]//span[@class = "username-content"]/text()')
        while c_author_index < len(c_author_path):
            comment_author = c_author_path[c_author_index]
            temp_list.append(comment_author.extract())
            c_author_index += 1
        c_body_combined = ""
        c_body_path = '//div[@class = "post-list-comments"]/g2g-comments-item[1]/article[@class = ' \
                      '"post-list-item clearfix two-columns ng-star-inserted"]/div/div//div[@class ' \
                      '="post-list-item-message-content post-content ng-star-inserted"]//text() '
        c_body = response.xpath(c_body_path.replace("1", str(c_body_index + 1)))
        c_body_list = c_body.extract()
        if len(c_body_list) > 1:
            for word in c_body_list:
                c_body_combined = c_body_combined + " " + word
            temp_list.append(c_body_combined)
            c_body_index += 1
        elif len(c_body_list) != 0:
            temp_list.append(c_body_list[0])
            c_body_index += 1
        elif len(c_body_list) == 0:
            c_body_index += 1
            c_body = response.xpath(c_body_path.replace("1", str(c_body_index + 1)))
            c_body_list = c_body.extract()
            if len(c_body_list) > 1:
                for word in c_body_list:
                    c_body_combined = c_body_combined + " " + word
                temp_list.append(c_body_combined)
                c_body_index += 1


temp_list = list()
all_post_data = list()

process = CrawlerProcess()
process.crawl(IdeaSpider)
process.start()
print(temp_list)
This is because the comment pages are loaded using JavaScript and Scrapy is not rendering JavaScript. You could use Splash.
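A minimal sketch of what the request side could look like with the scrapy-splash plugin, assuming a Splash instance is running locally on port 8050 and the scrapy-splash middlewares are enabled in settings.py (the settings shown in comments follow the scrapy-splash README; adjust to your project):

import scrapy
from scrapy_splash import SplashRequest

# settings.py additions commonly needed for scrapy-splash:
# SPLASH_URL = 'http://localhost:8050'
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy_splash.SplashCookiesMiddleware': 723,
#     'scrapy_splash.SplashMiddleware': 725,
#     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
# }
# SPIDER_MIDDLEWARES = {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100}
# DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'


class IdeaSplashSpider(scrapy.Spider):
    name = "IdeaSplashSpider"

    def start_requests(self):
        url = ("https://www.games2gether.com/amplitude-studios/endless-space-2/ideas/"
               "1850-force-infinite-actions-to-the-bottom-of-the-queue?page=0")
        # 'wait' gives the JavaScript frontend time to render the comments
        yield SplashRequest(url, callback=self.parse_idea, args={"wait": 3})

    def parse_idea(self, response):
        # response now holds the rendered HTML, so the XPaths from the
        # original question can be applied here
        comment_authors = response.xpath('//span[@class = "username-content"]/text()').extract()
        self.logger.info("found %d comment author nodes", len(comment_authors))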

python issue joining a relative img URL to an absolute URL

I'm having trouble getting my current code to work: I simply concatenate the URL, but it doesn't work.
The current relative path (this is what I get with a normal response.xpath crawl):
/imagename.jpg
This is my current code:
from scrapy.exceptions import CloseSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# MercadoItem is the project's item class, imported from its items module


class MercadoSpider(CrawlSpider):
    name = 'extractor'
    item_count = 0
    rules = {
        # for each item
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
             callback='parse_item', follow=False)
    }

    def parse_item(self, response):
        ml_item = MercadoItem()
        ml_item['titulo'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
        ml_item['sku'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
        ml_item['marca'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
        ml_item['tecnologia'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
        ml_item['tipo'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
        ml_item['precio'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
        ml_item['color'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div/div/ul/li/b)').extract()
        ml_item['potencia'] = response.xpath('normalize-space(//*[@id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
        ml_item['condicion'] = response.xpath('normalize-space(//*[@class="stock in-stock"])').extract_first()
        ml_item['desc_corta'] = response.xpath('normalize-space(//*[@id="tab-additional_information"])').extract()
        ml_item['descripcion'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[2]/div)').extract()
        ml_item['id_publicacion'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
        # product images
        xpath1 = 'http://www.website.com.ar'
        xpath2 = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
        ml_item['image_urls'] = xpath1 + xpath2
        ml_item['image_name'] = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract()
        # store or seller info
        ml_item['categoria'] = response.xpath('normalize-space(//*[@class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
        self.item_count += 1
        if self.item_count > 10000:
            raise CloseSpider('item_exceeded')
        yield ml_item
Try:
absolute_url = response.urljoin(your_url_from_xpath)
See response.urljoin in the Scrapy documentation.
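Applied to the image fields above, a minimal sketch of the relevant part of parse_item using the documented response.urljoin (wrapping image_urls in a list follows the usual ImagesPipeline convention, which may or may not match the original MercadoItem definition):

def parse_item(self, response):
    ml_item = MercadoItem()
    # ... other fields as in the original spider ...
    img_src = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
    if img_src:
        # urljoin resolves "/imagename.jpg" against the page URL, so no
        # hard-coded "http://www.website.com.ar" prefix is needed
        ml_item['image_urls'] = [response.urljoin(img_src)]
    yield ml_item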
