How to send objects from one rule to another with Scrapy - Python

I'm trying to scrape Glassdoor company rating, and at some point, I need to send some objects from one rule to the other.
This is the main link for the search: https://www.glassdoor.com/Reviews/lisbon-reviews-SRCH_IL.0,6_IM1121.htm
I access this page with the first Rule and get some information; then I need to follow another link from this page into the reviews page, using the XPath expression //a[@class='eiCell cell reviews '].
Here is the problem: how can I follow this link, matched by that XPath expression, inside parse_item without losing the information I already got?
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class GetComentsSpider(CrawlSpider):
    name = 'get_coments'
    allowed_domains = ['www.glassdoor.com']
    start_urls = ['https://www.glassdoor.com/Reviews/portugal-reviews-SRCH_IL.0,8_IN195.htm/']
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    download_delay = 0.1

    rules = (
        # Access the page, get the link from each company and move to parse_item
        Rule(LinkExtractor(restrict_xpaths="//div[@class=' margBotXs']/a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths="//a[@class='eiCell cell reviews ']"), callback='parse_item', follow=True),
        # Pagination
        Rule(LinkExtractor(restrict_xpaths="//li[@class='next']/a"), follow=True),
    )

    def parse_item(self, response):
        # get company name and rating
        name = response.xpath("(//span[@class='updateBy'])[1]").get()
        rating = response.xpath("//span[@class='bigRating strong margRtSm h1']/text()").get()
        # Here I need to go to the link of //a[@class='eiCell cell reviews '] to get more data
        # without losing the name and rating
        yield {
            "Name": name,
            "Rating": rating
        }

You can send the item to another parser using Request(..., meta=...) (and you don't need a Rule to get the URL for this request):
from scrapy import Request

def parse_item(self, response):
    name = response.xpath("(//span[@class='updateBy'])[1]").get()
    rating = response.xpath("//span[@class='bigRating strong margRtSm h1']/text()").get()
    item = {
        "Name": name,
        "Rating": rating
    }

    url = ...  # Here I need to go to the link of //a[@class='eiCell cell reviews '] to get more data
    # callback must be the method itself (not a string) when building a Request directly
    yield Request(url, callback=self.other_parser, meta={"item": item})

def other_parser(self, response):
    item = response.meta['item']
    item['other'] = ...  # add values to item
    yield item
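If it helps, here is a minimal sketch of how the reviews URL could be filled in, reusing the XPath from the question; the parse_reviews name, response.follow and cb_kwargs (Scrapy >= 1.7) are my additions, not part of the original answer:

def parse_item(self, response):
    item = {
        "Name": response.xpath("(//span[@class='updateBy'])[1]").get(),
        "Rating": response.xpath("//span[@class='bigRating strong margRtSm h1']/text()").get(),
    }
    # the same XPath the question mentions, now used to pull the href
    reviews_url = response.xpath("//a[@class='eiCell cell reviews ']/@href").get()
    if reviews_url:
        # cb_kwargs passes the partially-filled item straight into the next callback
        yield response.follow(reviews_url, callback=self.parse_reviews, cb_kwargs={"item": item})

def parse_reviews(self, response, item):
    item["Reviews"] = ...  # add whatever extra fields you need here
    yield item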

Related

Scrapy (with headers) is not extracting or parsing the item_urls

I'm building a Scrapy crawler/spider for a given website. I send Scrapy a starting URL (let's call it start_urls) and it should get the responses of all the URLs (matching certain parameters) that are contained on it.
Summarizing: it should enter start_url, then search for the company_urls given an allow parameter in the rule, and return the response of each company URL (everything done with headers). My code only parses the start_urls and never reaches the URLs I want to extract for the parser. What could be wrong?
LinkExtractor
link_extractor = LinkExtractor(
    allow=['/organization/'],
    allow_domains=['www.scrapsite.com'],
    deny_extensions=IGNORED_EXTENSIONS,  # Filter *.zip, *.csv, etc. (add other extensions as required)
    process_value=lambda url: process_url_value(url, NAME, cleaning_regex=[company_regex]),
)
ScrapySpider
class scrapsiteSpider(CrawlSpider):
    name = NAME
    download_delay = 5.0
    main_page = MAIN_PAGE
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    }
    start_urls = [
        f'https://www.scrapsite.com/search/companies/field/companies/company_page/{rank}'
        for rank in range(75, 132, 10)
    ]
    rules = [Rule(link_extractor, callback='parse', follow=True)]
    headers = HEADERS

    @classmethod
    def start_requests(cls):
        logger.info('Starting scrapsite scraping')
        for url in cls.start_urls:
            cls.log_counter += 1
            if cls.log_counter % cls.log_divider == 0:
                logger.info(f'Start request: {url}')
            yield Request(url, dont_filter=True, headers=HEADERS)

    @classmethod
    def parse(cls, response: Response):
        # CAPTURE COMPANIES
        logger.info(f"#### parse PREPROCESSING company {response.url}")
        logger.info(f"{response.meta}")
        if company_regex.search(response.url):
            logger.info(f"Company Detected: {response.url.split('/')[-1]}")
            return cls.parse_item(response, AddedItem())

    @classmethod
    def parse_item(cls, response: Response, item: Item) -> Item:
        logger.info(f"#### parse_item PREPROCESSING company {response.url}")
        item.set_url(value=response.url)
        item.set_source(value=cls.name)
        item.set_response_data(value=response.text)
        item.set_uuid(value=make_id_from_url(url=response.url))
        yield item
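One thing worth checking, though it is only a guess from the code shown: CrawlSpider uses the parse method internally to apply its rules, and the Scrapy docs explicitly warn against using parse as a Rule callback. A minimal sketch of the change, where parse_company is just a placeholder name:

rules = [Rule(link_extractor, callback='parse_company', follow=True)]

def parse_company(self, response):
    # same logic that previously lived in parse()
    if company_regex.search(response.url):
        return self.parse_item(response, AddedItem())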

Want to remove some text from the line

I need only the address, not the Tel, Fax and Email. When I run the code it gives me all of the data, but I want only the address. This is the page link: https://all.accor.com/hotel/8392/index.de.shtml
from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://all.accor.com/de/region/hotels-sachsen-dsn.shtml']
    page_number = 0

    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//a[@class='Teaser-link']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.xpath("//h3//text()").get()
        address = response.xpath("//div[@class='infos__content']//p//text()")[:-3].getall()
        address = [i.strip() for i in address]
        # remove empty strings:
        address = [i for i in address if i]
        yield {
            'name': title,
            'address': address,
        }
Your XPath selector for the address is wrong. You need to restrict the text you want to the first child div of the div with the class infos__content. Use the code below for the parse_book method and it should work.
def parse_book(self, response):
    title = response.xpath("//h3//text()").get()
    address = response.xpath("normalize-space(//div[@class='infos__content']/div[1]/p)").get()
    address = address.replace("\xa0", " ")
    yield {
        'name': title,
        'address': address,
    }

How to scrape an element from a scraped URL? Scrapy

OK, let's say I have a website that lists job offers across multiple pages (dynamic, which is why I'm using Selenium). What I want to do:
Scrape every job post URL on multiple pages,
Scrape a few items (title, location, etc.) from every URL
import scrapy
from scrapy import Request
from selenium import webdriver


class JobScraper(scrapy.Spider):
    name = "jobscraper"
    allowed_domains = ['pracuj.pl']
    total = 10
    start_urls = [
        'https://www.pracuj.pl/praca/it%20-%20rozw%c3%b3j%20oprogramowania;cc,5016/%c5%82%c3%b3dzkie;r,5?rd=10&pn={}'.format(i)
        for i in range(1, total)
    ]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
    }

    def __init__(self):
        self.options = webdriver.ChromeOptions()
        self.options.headless = True
        self.driver = webdriver.Chrome(r'C:\Users\kacpe\OneDrive\Pulpit\Python\Projekty\chromedriver.exe',
                                       options=self.options)

    def parse(self, response):
        self.driver.get(response.url)
        res = response.replace(body=self.driver.page_source)
        offers = res.xpath('//li[contains(@class, "results__list-container")]')
        for offer in offers:
            link = offer.xpath('.//a[@class="offer-details__title-link"]/@href').extract()
            yield Request(link, callback=self.parse_page)

    def parse_page(self, response):
        title = response.xpath('//h1[@data-scroll-id="job-title"]/text()').extract()
        yield {
            'job_title': title
        }
And it doesn't work; this is the error that occurred:
TypeError: Request url must be str or unicode, got list
You call extract in this line:
link = offer.xpath('.//a[@class="offer-details__title-link"]/@href').extract()
Extract returns a list of elements, which is why you get the error when you try to pass link to Request.
Depending on what you want to do, you could loop with for link in links and yield a Request for each result, or grab a single value with extract_first() instead of extract().
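A minimal sketch of the first option, looping over the extracted hrefs (response.urljoin is my addition, since the hrefs may be relative):

def parse(self, response):
    self.driver.get(response.url)
    res = response.replace(body=self.driver.page_source)
    offers = res.xpath('//li[contains(@class, "results__list-container")]')
    for offer in offers:
        # extract() returns a list, so yield one Request per URL instead of passing the list
        for link in offer.xpath('.//a[@class="offer-details__title-link"]/@href').extract():
            yield Request(response.urljoin(link), callback=self.parse_page)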
You don't need Selenium to scrape the required content. It turns out that the items you wish to grab from that site are within a script tag. Once you scoop out that portion using a regex and process it with the json library, you can access them very easily. The following is what I meant:
import json
import scrapy


class JobScraper(scrapy.Spider):
    name = "jobscraper"
    total = 10
    start_urls = [
        'https://www.pracuj.pl/praca/it%20-%20rozw%c3%b3j%20oprogramowania;cc,5016/%c5%82%c3%b3dzkie;r,5?rd=10&pn={}'.format(i)
        for i in range(1, total)
    ]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
    }

    def start_requests(self):
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse, headers=self.headers)

    def parse(self, response):
        items = response.css("script:contains('window.__INITIAL_STATE__')::text").re_first(r"window\.__INITIAL_STATE__ =(.*);")
        for item in json.loads(items)['offers']:
            yield {
                "title": item['jobTitle'],
                "employer": item['employer'],
                "country": item['countryName'],
                "details_page": item['companyProfileUrl']
            }

Scrapy does not fetch markup on response.css

I've built a simple scrapy spider running on scrapinghub:
class ExtractionSpider(scrapy.Spider):
    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def parse(self, response):
        urls = response.css('a.offer-details__title-link::attr(href)').extract()
        print(urls)
        for url in urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
        print(multiple_locs_urls)
        for url in multiple_locs_urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield SplashRequest(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            'title': response.css('#jobTitle').extract_first(),
            'content': response.css('#description').extract_first(),
            'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
            'address': response.css('span[itemprop="address"]').extract_first()
        }
The problem I am facing is that the multiple_locs_urls response.css call returns an empty array, even though I can see the elements in the markup on the browser side.
I checked with scrapy shell, and it does not see the markup either. I guess this is because the markup is rendered through JavaScript when the page is loaded.
I added Splash, but that does not seem to apply to the response. How can I make Scrapy wait with the query until the page is loaded?
See the source code for the page: view-source:pracuj.pl/praca/polska;ct,1.
There is no element with the class "offer-regions__label" in the HTML code.
This code will always return an empty list:
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)')
But as explained here https://stackoverflow.com/a/17697329/9913319:
Many times when crawling we run into problems where content that is
rendered on the page is generated with Javascript and therefore scrapy
is unable to crawl for it.
In this case you can use Selenium.
I changed your code and checked it and it works:
import scrapy
from scrapy_splash import SplashRequest
from selenium import webdriver


class ExtractionSpider(scrapy.Spider):
    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        profile = webdriver.FirefoxProfile("pathToFirefoxProfile")
        firefox_binary = "pathToFirefoxBinary"  # Must be the developer edition!!!
        # self.driver = webdriver.Firefox()
        self.driver = webdriver.Firefox(profile, firefox_binary=firefox_binary)

    def parse(self, response):
        self.driver.get(response.url)

        elements = self.driver.find_elements_by_css_selector("a.offer-details__title-link")
        for element in elements:
            print("****")
            print(str(element.get_attribute("href")))
            print(str(element.text))

        # your old code below
        urls = response.css('a.offer-details__title-link::attr(href)').extract()
        print(urls)
        for url in urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
        print(multiple_locs_urls)
        for url in multiple_locs_urls:
            url = response.urljoin(url)
            yield SplashRequest(url=url, callback=self.parse_details)

        next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield SplashRequest(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            'title': response.css('#jobTitle').extract_first(),
            'content': response.css('#description').extract_first(),
            'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
            'address': response.css('span[itemprop="address"]').extract_first()
        }
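If you would rather stay with Splash instead of adding Selenium, another option (a sketch, assuming scrapy-splash is installed and a Splash instance is configured in settings.py) is to ask Splash to wait before returning the rendered page:

from scrapy_splash import SplashRequest

def parse(self, response):
    for url in response.css('a.offer-regions__label::attr(href)').extract():
        # args={'wait': 2} tells Splash to wait ~2 seconds so the JavaScript can render
        yield SplashRequest(url=response.urljoin(url), callback=self.parse_details, args={'wait': 2})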

How to force Scrapy to show all items instead of just the last one?

Having the following spider:
import scrapy


class ScrapeNames(scrapy.Spider):
    name = 'final2'
    start_urls = [
        'https://www.trekearth.com/members/'
    ]

    def parse(self, response):
        for entry in response.xpath('//table[@class="member-table"]'):
            for name in entry.xpath('.//tr[@class="row"]/td/p/a/text()|.//tr/td/p/a/text()').extract():
                item['name'] = name
            for photo in entry.xpath('.//tr[@class="row"]/td[6]/a/text()|.//tr[@class="row"]/td[6]/text()|.//tr/td[6]/text()|.//tr/td[6]/a/text()').extract():
                item['photo'] = photo
            yield item
I want to extract the number of photos taken by each user and then export it to CSV. However, in my .csv I only have the last item from the table on this page (see screenshot below).
What I want, obviously, is to have the member name and number of photos taken for all of the users on a page. What am I doing wrong? How can I fix this?
EDIT:
Possibly this is relevant as well; my items.py file looks like this:
import scrapy


class FinalItem(scrapy.Item):
    name = scrapy.Field()
    photo = scrapy.Field()
    pass
FOLLOW UP QUESTION:
I have introduced some improvements into my code, which is now:
class ScrapeMovies(scrapy.Spider):
    name = 'final2'
    start_urls = [
        'https://www.trekearth.com/members/'
    ]

    def parse(self, response):
        item = FinalItem()
        for entry in response.xpath('//table[@class="member-table"]'):
            for name in entry.xpath('.//tr[@class="row"]/td/p/a/text()|.//tr/td/p/a/text()').extract():
                names = entry.xpath('.//tr[@class="row"]/td/p/a/text()|.//tr/td/p/a/text()').extract()
                item['name'] = ";".join(names)
            for photos in entry.xpath('.//tr[@class="row"]/td[6]/a/text()|.//tr[@class="row"]/td[6]/text()|.//tr/td[6]/text()|.//tr/td[6]/a/text()').extract():
                photos = entry.xpath('.//tr[@class="row"]/td[6]/a/text()|.//tr[@class="row"]/td[6]/text()|.//tr/td[6]/text()|.//tr/td[6]/a/text()').extract()
                item['photo'] = ";".join(photos)
            yield item
However, this created a mess in the final .csv, which now looks like this:
Is there a simple way to fix this?
Sample desired output in .csv below:
EDIT2:
My spider now:
import scrapy
from final.items import FinalItem


class ScrapeMovies(scrapy.Spider):
    name = 'final2'
    start_urls = [
        'https://www.trekearth.com/members/'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = FinalItem()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            item['photos'] = row.xpath('string(./td[6])').extract_first()
            yield item
It still does not yield the proper result; I only get an empty .csv. I have also updated settings.py.
UPDATE
You need this line in your settings.py (the site blocks the default Scrapy user agent):
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36'
and then this will work:
def parse(self, response):
    for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
        item = FinalItem()
        item['name'] = row.xpath('./td[2]//a/text()').extract_first()
        item['photos'] = row.xpath('string(./td[6])').extract_first()
        yield item
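For reference, a minimal settings.py sketch; the FEEDS block is my addition (available since Scrapy 2.1, older versions use FEED_URI/FEED_FORMAT), so adjust the filename as needed:

# settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36'

# write every yielded item to a CSV file
FEEDS = {
    'members.csv': {'format': 'csv'},
}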
