I am relatively new to Python and Scrapy.
I am trying to scrape the job portal https://www.jobs.ch/de/. Currently I start with https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/.
At the moment the scraper works fine, but it is not returning all job results. Out of 24 results per page, Scrapy returns a varying number of results (tested on 2 pages: 21/24 and 23/24). I checked whether the CSS path is different for the missing results, but it is identical. Does anybody have an idea why I don't get all results? I would really appreciate any suggestions.
import scrapy
from jobscraping.items import JobscrapingItem


class GetdataSpider(scrapy.Spider):
    name = 'getdata5'
    start_urls = ['https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/']

    def parse(self, response):
        yield from self.scrape(response)

        next_page = response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.itnOWY > a.sc-fznxsB.fvMaWZ.Link-sc-1vy3ms6-1.fvbIfL:last-child').attrib['href']
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def scrape(self, response):
        for add in response.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd'):
            item = JobscrapingItem()
            addpage = response.urljoin(add.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd a::attr(href)').get(default='not-found'))
            item['addlink'] = addpage
            item['Position'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.VacancySerpItem___StyledText-qr45cp-6.gHnsfC::text').get(default='not-found')
            item['Company'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY > strong::text').get(default='not-found')
            item['Location'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY::text').get(default='not-found')
            request1 = scrapy.Request(addpage, callback=self.get_addinfos)
            request1.meta['item'] = item
            yield request1

    def get_addinfos(self, response):
        for details in response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.VacancyDetailHead__StyledVacancyDetailHead-sc-14lkltl-0.VacancyDetailHead___StyledStyledVacancyDetailHead-sc-14lkltl-1.deEQGn'):
            item = response.meta['item']
            companypage = response.urljoin(details.css('div.sc-AxiKw.XkVWn > span > div > a::attr(href)').get(default='not-found'))
            item['companylink'] = companypage
            item['Date'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(1) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
            item['Rank'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(2) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
            item['Workload'] = details.css('span.sc-fzqNJr.Badge-sc-88vuku-0.dCIQfi::text').get(default='not-found')
            request2 = scrapy.Request(companypage, callback=self.get_companyinfos)
            request2.meta['item'] = item
            yield request2

    def get_companyinfos(self, response):
        item = response.meta['item']
        item['Industry'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.leHDqM::text').get(default='not-found')
        item['Open_jobs'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.fMPCQO > span::text').get(default='not-found')
        item['Employees'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.GqJfV > span::text').get(default='not-found')
        item['Rating_overall'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.gKcdxd::text').get(default='not-found')
        item['Rating_detailed'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.hVUXAg::text').getall()
        item['Rating_numbers'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.kxNaWG::text').get(default='not-found')
        yield item
The items.py file:
import scrapy


class JobscrapingItem(scrapy.Item):
    # define the fields for your item here like:
    addlink = scrapy.Field()
    Position = scrapy.Field()
    Company = scrapy.Field()
    Location = scrapy.Field()
    Date = scrapy.Field()
    Rank = scrapy.Field()
    Workload = scrapy.Field()
    companylink = scrapy.Field()
    Industry = scrapy.Field()
    Open_jobs = scrapy.Field()
    Employees = scrapy.Field()
    Rating_overall = scrapy.Field()
    Rating_detailed = scrapy.Field()
    Rating_numbers = scrapy.Field()
I found the mistake in my code. Some companies have posted more than one job, so several postings point to the same company page, and Scrapy's duplicate filter dropped those requests. Setting dont_filter=True on the company-page request resolved the issue. I also added an if statement for postings that have no companylink, so the scraper yields those items directly instead of trying to crawl a company page.
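A minimal sketch of how get_addinfos might look with those two changes; the selector strings below are shortened placeholders, the point is only the companylink check and the dont_filter=True flag:

    def get_addinfos(self, response):
        for details in response.css('div.VacancyDetailHead'):  # placeholder selector
            item = response.meta['item']
            companylink = details.css('a.company-link::attr(href)').get()  # placeholder selector
            if companylink is None:
                # posting without a company page: yield the item right away
                item['companylink'] = 'not-found'
                yield item
                continue
            item['companylink'] = response.urljoin(companylink)
            # dont_filter=True keeps the dupefilter from dropping company pages
            # that are shared by several postings
            yield scrapy.Request(
                item['companylink'],
                callback=self.get_companyinfos,
                meta={'item': item},
                dont_filter=True,
            )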
I am new to Scrapy and this is my first try at web scraping. The structure of the webpage from which I am trying to scrape is the following:
level 0: main company URL ---> level 1: several associated company URLs ---> level 2: each associated company URL in level 1 has many URLs linked ---> ... up to level n
Right now I can scrape data up to level 1, but I want to do it up to the n-th level recursively. There should be a control like max_depth up to which I want to scrape.
I cannot figure out how to do it.
Here is my spider which I wrote so far:
import scrapy
from ..items import *


class NodeSpider(scrapy.Spider):
    name = 'nodes'
    start_urls = ['https://www.zaubacorp.com/companysearchresults/DOIT-']
    base_url = 'https://www.zaubacorp.com/'
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def parse(self, response):
        search_links = response.xpath('//table[@id="results"]/tr/td/a[contains(@href,"company/DOIT-URBAN")]/@href').getall()
        page_list = search_links[1:]
        #url = search_links.pop(0)
        check_list = []
        for url in search_links:
            print("func 1")
            yield response.follow(url=url, callback=self.parse_doit,
                                  meta={'page_list': page_list,
                                        'check_list': check_list
                                        })

    def parse_doit(self, response):
        print("func 2")
        check_list = response.meta['check_list']
        lnk = MainLink()
        lnk['name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        lnk['url'] = response.url
        lnk['address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        lnk['email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        lnk['director1'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        lnk['director2'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        dir1_same_co_list = response.xpath('//*[@id="accordion1"]/table[1]//td//p/a/@href').getall()
        dir2_same_co_list = response.xpath('//*[@id="accordion2"]/table[1]//td//p/a/@href').getall()
        co_list = dir1_same_co_list + list(set(dir2_same_co_list) - set(dir1_same_co_list))
        dir_same_co_list = list(set(co_list) - set(check_list))
        check_list = check_list + list(set(dir_same_co_list) - set(check_list))
        page_list = response.meta['page_list']
        if dir1_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list
                                        })

    def parse_level_2(self, response):
        print("func 3")
        lnk = response.meta['name']
        lnk = response.meta['url']
        lnk = response.meta['address']
        lnk = response.meta['email']
        lnk = response.meta['director1']
        lnk = response.meta['director2']
        page_list = response.meta['page_list']
        #next_page = response.meta['next_page']
        level_2 = SecondaryLink()
        try:
            lnk['Company_Details_W_Same_Directors']
        except:
            lnk['Company_Details_W_Same_Directors'] = []
        #for sub_link in dir1_same_co_list:
        level_2['Co_Name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        level_2['Co_url'] = response.url
        level_2['Address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        level_2['Email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        level_2['First_Director'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        level_2['Second_Director'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        lnk['Company_Details_W_Same_Directors'].append(level_2)
        dir_same_co_list = response.meta['dir_same_co_list']
        print("===== start reading co list =====")
        if dir_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            print("co list", len(dir_same_co_list))
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list
                                        })
        else:
            if page_list:
                print("next page loop")
                next_page = page_list.pop(0)
                next_page_url = next_page
                yield response.follow(url=next_page_url, callback=self.parse_doit,
                                      meta={'name': lnk,
                                            'url': lnk,
                                            'address': lnk,
                                            'email': lnk,
                                            'director1': lnk,
                                            'director2': lnk,
                                            'next_page': next_page,
                                            'page_list': page_list})
            else:
                yield lnk
And here is the items.py:
class MainLink(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    address = scrapy.Field()
    email = scrapy.Field()
    director1 = scrapy.Field()
    Company_Details_W_Same_Directors = scrapy.Field()
    director2 = scrapy.Field()
    pass


class SecondaryLink(scrapy.Item):
    Co_Name = scrapy.Field()
    Co_url = scrapy.Field()
    Address = scrapy.Field()
    Email = scrapy.Field()
    First_Director = scrapy.Field()
    Second_Director = scrapy.Field()
    pass
Help is much appreciated
You can make use of the DEPTH_LIMIT setting in Scrapy, which caps how many links deep the spider will follow from the start URLs. Please see https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit
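A minimal sketch of how this could be applied to the spider above, assuming a spider-wide limit is what you want (the value 3 is only an example; the setting can also live in settings.py):

class NodeSpider(scrapy.Spider):
    name = 'nodes'
    start_urls = ['https://www.zaubacorp.com/companysearchresults/DOIT-']
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        # requests more than 3 hops away from the start URLs are dropped
        'DEPTH_LIMIT': 3,
    }

Scrapy also records the current depth of every request in request.meta['depth'], so a callback can read that value if you want custom behaviour instead of silently dropping deep requests.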
I'm still new to Scrapy. When trying to read data from quotes.toscrape.com, I don't get the content I expect when using XPath selectors. As soon as I use CSS selectors, everything works as intended. I just can't find the error, even though the example is super simple.
quotes.py
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector
            item['author_name'] = quote.xpath('//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item

        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('//*[@class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
items.py
import scrapy
from scrapy.loader import ItemLoader


class QuotesLoaderItem(scrapy.Item):
    # define the fields for your item here like:
    author_name = scrapy.Field()
    quote_text = scrapy.Field()
    author_link = scrapy.Field()
    tags = scrapy.Field()
Result
author_name,quote_text,author_link,tags
Albert Einstein,“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”,/author/Albert-Einstein,change
Albert Einstein, ...
...
(20 times)
Thank you for your help.
I was using a Selector object instead of a Response object, so the XPath expressions have to be relative and the syntax has to look like this:
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector
            item['author_name'] = quote.xpath('.//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('.//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('.//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('.//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item

        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('.//*[@class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
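The key difference, shown as a small sketch independent of the spider above: inside a loop over selector objects, an XPath starting with // still searches the whole document, while .// searches only within the current selection.

for quote in response.xpath('//div[@class="quote"]'):
    # absolute path: restarts from the document root, so every iteration
    # returns the first author on the page
    first_author = quote.xpath('//small[@class="author"]/text()').get()
    # relative path: starts from this <div class="quote">, so every
    # iteration returns that quote's own author
    own_author = quote.xpath('.//small[@class="author"]/text()').get()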
I'm trying to improve my web-scraping skills, but I'm stuck with my script. I want to scrape some information from Amazon.
Here's my script so far:
import scrapy
from ..items import AmazontutorialItem


class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon'
    page_number = 2
    start_urls = ['https://www.amazon.com/s?bbn=1&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&fst=as%3Aoff&qid=1606224210&rnid=1250225011&ref=lp_1_nr_p_n_publication_date_0']

    def parse(self, response):
        items = AmazontutorialItem()
        product_name = response.css('.a-color-base.a-text-normal::text').extract()
        product_author = response.css('.sg-col-12-of-28 span.a-size-base+ .a-size-base::text').extract()
        product_price = response.css('.a-spacing-top-small .a-price-whole::text').extract()
        product_imagelink = response.css('.s-image::attr(src)').extract()
        items['product_name'] = product_name
        items['product_author'] = product_author
        items['product_price'] = product_price
        items['product_imagelink'] = product_imagelink
        yield items

        next_page = 'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A283155%2Cn%3A1000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&page=' + str(AmazonSpiderSpider.page_number) + '&fst=as%3Aoff&qid=1606229780&rnid=1250225011&ref=sr_pg_2'
        if AmazonSpiderSpider.page_number <= 3:
            AmazonSpiderSpider += 1
            yield response.follow(next_page, callback=self.parse)
But I get this error:
UnboundLocalError: local variable 'AmazonSpiderSpider' referenced before assignment
I don't understand; I have never had this error before, even with web scraping.
Any ideas? Thanks.
The problem is the line AmazonSpiderSpider += 1: because parse assigns to the name AmazonSpiderSpider, Python treats AmazonSpiderSpider as a local variable throughout the whole method, so the earlier read in the next_page = ... line fails with UnboundLocalError. What you intended was to increment the counter on the spider itself, i.e. self.page_number += 1.
The following should fix your issue:
import scrapy
from ..items import AmazontutorialItem


class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon'
    page_number = 2
    start_urls = ['https://www.amazon.com/s?bbn=1&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&fst=as%3Aoff&qid=1606224210&rnid=1250225011&ref=lp_1_nr_p_n_publication_date_0']

    def parse(self, response):
        items = AmazontutorialItem()
        product_name = response.css('.a-color-base.a-text-normal::text').extract()
        product_author = response.css('.sg-col-12-of-28 span.a-size-base+ .a-size-base::text').extract()
        product_price = response.css('.a-spacing-top-small .a-price-whole::text').extract()
        product_imagelink = response.css('.s-image::attr(src)').extract()
        items['product_name'] = product_name
        items['product_author'] = product_author
        items['product_price'] = product_price
        items['product_imagelink'] = product_imagelink
        yield items

        next_page = 'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A283155%2Cn%3A1000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&page=' + str(self.page_number) + '&fst=as%3Aoff&qid=1606229780&rnid=1250225011&ref=sr_pg_2'
        if self.page_number <= 3:
            self.page_number += 1
            yield response.follow(next_page, callback=self.parse)
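For illustration, a stripped-down sketch of the same trap outside Scrapy (hypothetical names, only to show the mechanism; in the spider the shadowed name happens to be the class itself):

counter = 0

def broken():
    print(counter)   # UnboundLocalError: the assignment below makes
    counter += 1     # 'counter' a local variable for the whole function,
                     # so this read happens before any local assignment

def fixed():
    global counter   # explicitly bind to the module-level name
    print(counter)
    counter += 1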
I have a simple spider that crawls local obituaries. The code works perfectly until I try to add two static columns. All I want to do is add the date I pulled the information (the pull field) and the state in which it was pulled (the state field). It's a self-loading page, so when I add the pull date, I only get the first 10 results (i.e. only the first page). If I add just the state, I only get two results. When I remove both, I get all 40+ results.
I marked with # the lines that aren't working properly:
items.py file:
import scrapy


class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    #pull = scrapy.Field()
    #state = scrapy.Field()
spider file:
import scrapy
import time
from al.items import AlItem


class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        #pull = time.strftime("%m/%d/%Y")
        #state = "AL"

        for item in zip(name, link, obit, news):  # removed 'pull, state'
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            #new_item['pull'] = pull
            #new_item["state"] = state
            yield new_item
Let me explain why:
If you add pull and state to zip(name, link, obit, news), the loop only runs twice, because state = "AL" is a string. zip() iterates over each of its arguments, a string is consumed character by character, and zip() stops at its shortest argument, so two characters mean two iterations no matter how many results the other lists hold. The same happens with the date: "01/01/2001" has 10 characters, so you get 10 iterations (one page of results).
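A quick illustration of that zip() behaviour in plain Python, unrelated to the spider itself:

names = ['a', 'b', 'c', 'd']
links = ['l1', 'l2', 'l3', 'l4']
state = "AL"  # a 2-character string

# zip() stops at its shortest argument, and a string iterates per character
print(list(zip(names, links, state)))
# [('a', 'l1', 'A'), ('b', 'l2', 'L')]  -> only 2 rows instead of 4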
This will work:
class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    pull = scrapy.Field()
    state = scrapy.Field()
import scrapy
import time
from al.items import AlItem


class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        pull = time.strftime("%m/%d/%Y")
        state = "AL"

        for item in zip(name, link, obit, news):  # note: pull and state stay out of zip()
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            new_item['pull'] = pull
            new_item["state"] = state
            yield new_item
For this URL, I need all the product URLs and their respective TYPE.
So the output should be:
Product_URL1 Blouse
Product_URL2 Crop Top
Product_URL3 Tank Top
Product_URL4 Strappy Top
Product_URL5 Tube Top
Below is my code. I guess everything is right except the xpath for item['type'].
from scrapy.spiders import CrawlSpider
import scrapy
from scrapy.http.request import Request


class JabongItem(scrapy.Item):
    base_link = scrapy.Field()
    type = scrapy.Field()
    count = scrapy.Field()
    product_name = scrapy.Field()
    product_link = scrapy.Field()


class JabongScrape(CrawlSpider):
    name = "jabong"
    allowed_domains = "jabong.com"
    start_urls = ["http://www.jabong.com/women/clothing/tops-tees-shirts/tops", "http://www.jabong.com/women/clothing/tops-tees-shirts/tees"]

    def parse(self, response):
        item = JabongItem()
        try:
            for idx in range(0, 20):
                item['type'] = response.xpath("//div[contains(@class, 'options')]/label/a/text()").extract()[idx]
                item['base_link'] = response.url + response.xpath("//div[contains(@class, 'options')]/label/a/@href").extract()[idx] + "?ax=1&page=1&limit=" + (response.xpath("//div[contains(@class, 'options')]/label/small/text()").extract()[idx]).replace("[", "").replace("]", "") + "&sortField=popularity&sortBy=desc"
                item['count'] = (response.xpath("//div[contains(@class, 'options')]/label/small/text()").extract()[idx]).replace("[", "").replace("]", "")
                yield Request(item['base_link'], callback=self.parse_product_link,
                              meta={'item': item, 'count': int(item['count'])}, dont_filter=True)
        except:
            pass

    def parse_product_link(self, response):
        item = response.meta['item']
        try:
            for i in range(0, response.meta['count']):
                item['product_link'] = response.xpath("//div[contains(@class, 'col-xxs-6 col-xs-4 col-sm-4 col-md-3 col-lg-3 product-tile img-responsive')]/a/@href").extract()[i]
                # item['original_price'] = response.xpath("section.row > div:nth-child(1) > a:nth-child(1) > div:nth-child(2) > div:nth-child(2) > span:nth-child(1) > span:nth-child(1)::text").extract()[idx]
                print i
                yield item
        except:
            pass
And the jbng_base_links.txt contains "http://www.jabong.com/women/clothing/tops-tees-shirts/tops"
As Rafael pointed out, the easiest way of doing this is simply restructuring your spider to follow this order:
Go to the webpage
Find the type URLs
Go to every type URL -> scrape items
It could be as simple as:
class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = []

    def parse(self, response):
        """this will parse the landing page for type urls"""
        urls = response.xpath("//div[contains(text(),'Type')]/..//a/@href").extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url, self.parse_type)

    def parse_type(self, response):
        """this will parse every type page for items"""
        type_name = response.xpath("//a[@class='filtered-brand']/text()").extract_first()
        product_urls = ...
        for url in product_urls:
            yield {'type': type_name, 'url': url}
        # handle next page
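As a rough sketch of what the elided pieces might look like, with placeholder selectors that would have to be adapted to the site's actual markup (not verified against jabong.com):

    def parse_type(self, response):
        """parse one type page: yield its items, then follow pagination"""
        type_name = response.xpath("//a[@class='filtered-brand']/text()").extract_first()

        # placeholder selector: product anchors inside the listing grid
        product_urls = response.xpath("//div[contains(@class, 'product-tile')]/a/@href").extract()
        for url in product_urls:
            yield {'type': type_name, 'url': response.urljoin(url)}

        # placeholder selector: follow the next results page, if any
        next_page = response.xpath("//a[@rel='next']/@href").extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), self.parse_type)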