Why is my Scrapy Response.follow not following the links and returning empty? - python

Below you can see my spider (in this case it is just a test). As a test, I first ran parse_remedios as the main parse method and it returned results. But when I use it as the second function (parse_remedios, as in the code below), the results are empty. I suspect it has something to do with response.follow not running well. Any ideas?
import scrapy

class RemediosSpider(scrapy.Spider):
    name = 'remedios'
    allowed_domains = ['www.drogariaspachecopacheco.com.br']
    start_urls = ['https://www.drogariaspacheco.com.br/clorana%2025mg']

    def parse(self, response):
        print(response.url)
        for link in response.css('.collection-link::attr(href)'):
            yield response.follow(link.get(), callback=self.parse_remedios)

    def parse_remedios(self, response):
        resultado = response.css('.container-fluid')
        yield {
            'nome': resultado.css('.productName::text').get(),
            'preco': resultado.css('.skuBestPrice::text').get(),
            'link': response.url,
            'sku': resultado.css('.skuReference::text').get()
        }

The problem is with your allowed_domains. Scrapy is filtering out all of the links after the start_urls because they do not match any of the domains in your allowed_domains list. I have corrected it in the example below.
You can see it in the output logs:
2022-09-15 21:38:24 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.drogariaspacheco.com.br': <GET https://www.drogariaspacheco.com.br/clorana-25mg-sanofi-aventis-30-comprimidos/p>
import scrapy

class RemediosSpider(scrapy.Spider):
    name = 'remedios'
    allowed_domains = ['drogariaspacheco.com.br']
    start_urls = ['https://www.drogariaspacheco.com.br/clorana%2025mg']

    def parse(self, response):
        for link in response.css('.collection-link::attr(href)'):
            yield response.follow(link.get(), callback=self.parse_remedios)

    def parse_remedios(self, response):
        resultado = response.css('.container-fluid')
        yield {
            'nome': resultado.css('.productName::text').get(),
            'preco': resultado.css('.skuBestPrice::text').get(),
            'link': response.url,
            'sku': resultado.css('.skuReference::text').get()
        }
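To see why the original allowed_domains value causes the filtering, you can check a URL against a list of domains the same way the offsite middleware does. A minimal sketch, assuming Scrapy's scrapy.utils.url.url_is_from_any_domain helper; the product URL is taken from the log line above:

from scrapy.utils.url import url_is_from_any_domain

url = 'https://www.drogariaspacheco.com.br/clorana-25mg-sanofi-aventis-30-comprimidos/p'

# Checked against the misspelled domain from the original spider: filtered out.
print(url_is_from_any_domain(url, ['www.drogariaspachecopacheco.com.br']))  # False

# Checked against the bare registered domain: the www. subdomain is allowed too.
print(url_is_from_any_domain(url, ['drogariaspacheco.com.br']))  # True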

Related

Scrapy Response is returning 'None' even with correct syntax

I'm trying to get the items' names in a dictionary like this:
import scrapy

class TerrorSpider(scrapy.Spider):
    name = 'terror'
    start_urls = ['http://books.toscrape.com/catalogue/category/books/travel_2/index.html']

    def parse(self, response):
        for filme in response.css('h3 a'):
            yield {
                'name': filme.css('h3 a::text').get()
            }
And I really don't know why it's returning 'None' in the 'name' field (the response code is 200). When I try to get it in the Scrapy shell, it works.
I'm expecting to get data like with this other code:
import scrapy

class ImdbSpider(scrapy.Spider):
    name = 'imdb'
    start_urls = ['https://www.imdb.com/chart/top/?ref_=nv_mv_250']

    def parse(self, response):
        for filmes in response.css('.titleColumn'):
            yield {
                'names': filmes.css('.titleColumn a::text').get(),
                'years': filmes.css('.secondaryInfo ::text').get()[1:-1],
                'notes': response.css('strong ::text').get()
            }
It works correctly and it's basically the same code.
You are trying to get the selector for the selector you have already chosen... In other words, all you need to do is call .css('::text').get() on the filme variable. There is no need to repeat the h3 and a tag elements because you already chose those in the earlier selector.
import scrapy

class TerrorSpider(scrapy.Spider):
    name = 'terror'
    start_urls = ['http://books.toscrape.com/catalogue/category/books/travel_2/index.html']

    def parse(self, response):
        for filme in response.css('h3 a'):
            yield {
                'name': filme.css('::text').get()
            }
You could also do:
...
    def parse(self, response):
        for filme in response.css('h3 a::text').getall():
            yield {'name': filme}
or even:
    def parse(self, response):
        yield from ({'name': filme} for filme in response.css('h3 a::text').getall())
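If you also want each book's link alongside its name, the same h3 a selector carries the attributes too. A small sketch that would slot into the same spider, assuming Parsel's Selector.attrib mapping and assuming books.toscrape.com stores the full title in the anchor's title attribute (the visible anchor text is truncated for long titles):

    def parse(self, response):
        for filme in response.css('h3 a'):
            yield {
                # Fall back to the visible text if there is no title attribute.
                'name': filme.attrib.get('title') or filme.css('::text').get(),
                # urljoin turns the relative href into an absolute URL.
                'url': response.urljoin(filme.attrib.get('href', '')),
            }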

When I run the command 'scrapy crawl Admission', the error that occurs is: Filtered offsite request to 'www.worldomets.info'

import scrapy

class AdmissionsSpider(scrapy.Spider):
    name = 'Admissions'
    allowed_domains = ["www.worldometers.info"]
    start_urls = ['https://www.worldometers.info/population/countries-in-asia-by-population/']

    def parse(self, response):
        countries = response.xpath("//td/a")
        for country in countries:
            name = country.xpath(".//text()").get()
            links = country.xpath(".//@href").get()
            absolute_url = f"https://www.worldomets.info{links}"
            yield scrapy.Request(url=absolute_url)
I am trying to print the countries' names, but it shows me the error: Filtered offsite request to 'www.worldomets.info': <GET https://www.worldomets.info/world-population/china-population/>
You can try it like this:
import scrapy

class AdmissionsSpider(scrapy.Spider):
    name = 'Admissions'
    allowed_domains = ["worldometers.info"]
    start_urls = ['https://www.worldometers.info/population/countries-in-asia-by-population/']

    def parse(self, response):
        countries = response.xpath("//td/a")
        for country in countries:
            name = country.xpath(".//text()").get()
            link = country.xpath(".//@href").get()
            link = response.urljoin(link)
            # print(link)
            # absolute_url = f"https://www.worldomets.info{links}"
            yield scrapy.Request(url=link, callback=self.parse_absoluteurl)

    def parse_absoluteurl(self, response):
        print('\n', response.url, '\n')
For the details, see Following hyperlink and "Filtered offsite request".
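Since the question also wants the country names, you can carry each name into the second callback instead of only printing the URL. A sketch of the two methods that would replace those in the spider above, assuming Scrapy 1.7+ where Request accepts cb_kwargs:

    def parse(self, response):
        for country in response.xpath("//td/a"):
            name = country.xpath(".//text()").get()
            link = response.urljoin(country.xpath(".//@href").get())
            # cb_kwargs passes the name straight to the callback as an argument.
            yield scrapy.Request(url=link, callback=self.parse_absoluteurl,
                                 cb_kwargs={'name': name})

    def parse_absoluteurl(self, response, name):
        yield {'country': name, 'url': response.url}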

Scrapy: how to add the "query url" as an item in the output

I have a list of URLs and I import them as start_urls in a Scrapy project.
I would like to add to the output the query (URL) that generated those results.
For example, if I have
"First title results", "Address", etc ----> URL that generated this output.
This is the code I have:
import scrapy
from scrapy import Request

class GmapsclosedlocationsSpider(scrapy.Spider):
    name = 'gmapsclosedlocations'
    allowed_domains = ['https://www.google.com']

    with open('urls.csv') as file:
        start_urls = [line.strip() for line in file]

    def start_request(self):
        request = Request(url=self.start_urls, callback=self.parse)
        yield request

    def parse(self, response):
        yield {
            'name': response.css('.qrShPb').extract(),
            'closed': response.css('p.wlxxf::text').extract(),
            'address': response.css('.LrzXr::text').extract(),
            'phone': response.xpath('//*[contains(text(), "+46")]').extract_first(),
            'website': response.css('a.ab_button::attr(href)').extract(),
            'firsttitle': response.css('.DKV0Md::text').extract_first()
        }
I would like to have a new item in the "yield" that adds the URL, but I have no idea how to do that.
Thank you!
Just use the response.url attribute to grab each current URL inside your yield, as follows:
def parse(self, response):
    yield {
        'name': response.css('.qrShPb').extract(),
        'closed': response.css('p.wlxxf::text').extract(),
        'address': response.css('.LrzXr::text').extract(),
        'phone': response.xpath('//*[contains(text(), "+46")]').extract_first(),
        'website': response.css('a.ab_button::attr(href)').extract(),
        'firsttitle': response.css('.DKV0Md::text').extract_first(),
        'url': response.url
    }
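One caveat: if a start URL gets redirected, response.url is the final URL, not the one read from urls.csv. A sketch of a variant, assuming the default RedirectMiddleware, which records the intermediate URLs in response.meta['redirect_urls']:

def parse(self, response):
    # The first entry of 'redirect_urls' is the originally requested URL,
    # if any redirect happened; otherwise fall back to response.url.
    query_url = response.meta.get('redirect_urls', [response.url])[0]
    yield {
        'url': response.url,       # URL that was actually fetched
        'query_url': query_url,    # URL that generated the request
    }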

Why is my start_requests function not calling my parse function in my Scrapy program?

I am trying to scrape the reviews of this particular IMDb title. But for some reason, start_requests is not calling the parse function for this title alone. For another title, it seems to work.
Code examples:
import scrapy
from scrapy_selenium import SeleniumRequest

class imdb(scrapy.Spider):
    name = 'imdb'

    def start_requests(self):
        c = "https://www.imdb.com/title/tt8217188/reviews"
        yield SeleniumRequest(
            url=c,
            wait_time=4,
            screenshot=True,
            callback=self.parse)

    def parse(self, response):
        print("done")
Looks like you didn't bother to read the docs:
https://github.com/clemfromspace/scrapy-selenium
Anyway, try this code:
import scrapy
from scrapy_selenium import SeleniumRequest

class MyBotSpider(scrapy.Spider):
    name = 'mybot'
    custom_settings = {
        'SELENIUM_DRIVER_TYPE': 'executable',
        'SELENIUM_DRIVER_NAME': 'chrome',
        'SELENIUM_DRIVER_EXECUTABLE_PATH': r'c:\chromedriver.exe',
        'SELENIUM_DRIVER_ARGUMENTS': [],
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_selenium.SeleniumMiddleware': 800
        },
    }

    def start_requests(self):
        yield SeleniumRequest(
            url="https://www.imdb.com/title/tt8217188/reviews",
            wait_time=4,
            screenshot=True,
            callback=self.parse)

    def parse(self, response):
        self.logger.info("Here")
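To confirm the Selenium middleware is actually handling the request, the scrapy-selenium README describes two meta keys: the WebDriver instance on request.meta['driver'] and, when screenshot=True, the PNG bytes on response.meta['screenshot']. A hedged sketch of a parse method using those keys (the filename is arbitrary):

    def parse(self, response):
        self.logger.info("Here: %s", response.url)
        # Screenshot bytes are only present because screenshot=True was passed.
        screenshot = response.meta.get('screenshot')
        if screenshot:
            with open('reviews.png', 'wb') as f:
                f.write(screenshot)
        # The underlying Selenium driver, if you need to interact with the page.
        driver = response.request.meta.get('driver')
        if driver:
            self.logger.info("Page title via Selenium: %s", driver.title)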

Can't scrape next page contents using Scrapy

I want to scrape the contents from the next pages too, but it doesn't go to the next page. My code is:
import scrapy

class AggregatorSpider(scrapy.Spider):
    name = 'aggregator'
    allowed_domains = ['startech.com.bd/component/processor']
    start_urls = ['https://startech.com.bd/component/processor']

    def parse(self, response):
        processor_details = response.xpath('//*[@class="col-xs-12 col-md-4 product-layout grid"]')
        for processor in processor_details:
            name = processor.xpath('.//h4/a/text()').extract_first()
            price = processor.xpath('.//*[@class="price space-between"]/span/text()').extract_first()
            print('\n')
            print(name)
            print(price)
            print('\n')
        next_page_url = response.xpath('//*[@class="pagination"]/li/a/@href').extract_first()
        # absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(next_page_url)
I didn't use urljoin because next_page_url gives me the whole URL. I also tried the dont_filter=True argument in the yield, which gives me an infinite loop through the first page. The message I'm getting from the terminal is: [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.startech.com.bd': <GET https://www.startech.com.bd/component/processor?page=2>
This is because your allowed_domains variable is wrong; use allowed_domains = ['www.startech.com.bd'] instead (see the docs).
You can also modify your next page selector in order to avoid going to page one again:
import scrapy

class AggregatorSpider(scrapy.Spider):
    name = 'aggregator'
    allowed_domains = ['www.startech.com.bd']
    start_urls = ['https://startech.com.bd/component/processor']

    def parse(self, response):
        processor_details = response.xpath('//*[@class="col-xs-12 col-md-4 product-layout grid"]')
        for processor in processor_details:
            name = processor.xpath('.//h4/a/text()').extract_first()
            price = processor.xpath('.//*[@class="price space-between"]/span/text()').extract_first()
            yield {'name': name, 'price': price}
        next_page_url = response.css('.pagination li:last-child a::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(next_page_url)
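An alternative for the pagination step is response.follow, which resolves relative URLs and reuses the parse callback; it goes through the same offsite check, so the corrected allowed_domains above is still needed. A sketch of just the last lines of parse under that assumption:

        next_page_url = response.css('.pagination li:last-child a::attr(href)').get()
        if next_page_url:
            # follow() handles relative hrefs, and duplicate pages are
            # skipped by the default dupefilter.
            yield response.follow(next_page_url, callback=self.parse)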
