Scrapy-crawled-200 Referer-None - python

I'm trying to learn how to use Scrapy and Python, but I'm not an expert at all...
I have an empty file after crawling this page:
so.news.cn
and I don't understand why...
Here is my code:
import scrapy

class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/']

    def parse(self, response):
        #titles = response.css('#newsCon > div.newsList > div.news > h2 > a::text').extract()
        #date = response.css('#newsCon > div.newsList > div.news > div > p.newstime > span::text').extract()
        titles = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/h2/a/text()").extract()
        date = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/div[@class='easynews']/p[@class='newstime']/span/text()").extract()
        for item in zip(titles, date):
            scraped_info = {
                "title": item[0],
                "date": item[1],
            }
            yield scraped_info
        nextPg = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='pagination']/a[@class='next']/@href").extract()
        if nextPg is not None:
            print(nextPg)
This is the message in the console:
2020-05-11 00:09:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/> (referer: None)
[]

You always need to check the page's source code (Ctrl+U) in your browser. The content you see in your browser may be loaded by an XHR JavaScript call. Here is code that works for me (I found the correct start URL using the Chrome Developer Console):
import scrapy
import json
import re

class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    # allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/getNews?keyWordAll=&keyWordOne=%25E6%2596%25B0%25E5%2586%25A0%2B%25E8%2582%25BA%25E7%2582%258E%2B%25E6%25AD%25A6%25E6%25B1%2589%2B%25E7%2597%2585%25E6%25AF%2592&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn&keyword=%E6%96%B0%E5%86%A0&curPage=1']

    def parse(self, response):
        data = json.loads(response.body)
        for item in data["content"]["results"]:
            scraped_info = {
                "title": item['title'],
                "date": item['pubtime'],
            }
            yield scraped_info

        current_page = data['content']['curPage']
        total_pages = data['content']['pageCount']
        if current_page < total_pages:
            next_page = re.sub(r'curPage=\d+', f"curPage={current_page + 1}", response.url)
            yield scrapy.Request(
                url=next_page,
                callback=self.parse,
            )
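To save the scraped items to a file, you can run the spider through Scrapy's built-in feed exports; a minimal sketch, assuming the spider above is saved as xinhua.py (the file name is just an assumption):

# run the standalone spider and write the yielded items to a JSON file
scrapy runspider xinhua.py -o results.json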

Related

Why my Scrapy Response.follow is not following the links and returns empty?

Below you can see my spider (in this case it is just a test). When I run parse_remedios as the main parse method, it returns results. But when I use it as the second function (parse_remedios, as in the code below), the results are empty. I know it has something to do with response.follow not working well. Any ideas?
import scrapy

class RemediosSpider(scrapy.Spider):
    name = 'remedios'
    allowed_domains = ['www.drogariaspachecopacheco.com.br']
    start_urls = ['https://www.drogariaspacheco.com.br/clorana%2025mg']

    def parse(self, response):
        print(response.url)
        for link in response.css('.collection-link::attr(href)'):
            yield response.follow(link.get(), callback=self.parse_remedios)

    def parse_remedios(self, response):
        resultado = response.css('.container-fluid')
        yield {
            'nome': resultado.css('.productName::text').get(),
            'preco': resultado.css('.skuBestPrice::text').get(),
            'link': response.url,
            'sku': resultado.css('.skuReference::text').get()
        }
The problem is with your allowed_domains. Scrapy filters out all of the links after the start_urls because they do not match any of the domains in your allowed_domains list. I have corrected it in the example below; you can see the filtering happening in the output logs:
2022-09-15 21:38:24 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.drogariaspacheco.com.br': <GET https://www.drogariaspacheco.com.br/clorana-25mg-sanofi-aventis-30-comprimidos/p>
import scrapy

class RemediosSpider(scrapy.Spider):
    name = 'remedios'
    allowed_domains = ['drogariaspacheco.com.br']
    start_urls = ['https://www.drogariaspacheco.com.br/clorana%2025mg']

    def parse(self, response):
        for link in response.css('.collection-link::attr(href)'):
            yield response.follow(link.get(), callback=self.parse_remedios)

    def parse_remedios(self, response):
        resultado = response.css('.container-fluid')
        yield {
            'nome': resultado.css('.productName::text').get(),
            'preco': resultado.css('.skuBestPrice::text').get(),
            'link': response.url,
            'sku': resultado.css('.skuReference::text').get()
        }
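If you want to check how a URL is matched against an allowed_domains list, Scrapy ships a small helper, url_is_from_any_domain (in scrapy.utils.url), that applies the same kind of domain matching; a small sketch using the URL from the log above:

from scrapy.utils.url import url_is_from_any_domain

url = 'https://www.drogariaspacheco.com.br/clorana-25mg-sanofi-aventis-30-comprimidos/p'

# the original entry does not match the crawled host, so such requests get filtered
print(url_is_from_any_domain(url, ['www.drogariaspachecopacheco.com.br']))  # False
# the corrected entry matches www.drogariaspacheco.com.br and its subdomains
print(url_is_from_any_domain(url, ['drogariaspacheco.com.br']))             # True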

Not being able to follow links in Scrapy

When I run my Scrapy web crawler, it does not follow the links to the pages whose data I want to scrape.
import scrapy
from ..items import YellowpagesItem

class YSpider(scrapy.Spider):
    name = 'yp2'
    allowed_domains = ['yellowpages.com']
    start_urls = [
        'https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Conshohocken%2C+PA'
    ]

    def parse(self, response):
        for link in response.css('a.businesss-name'):
            yield response.follow(link.attrib.get('href'), callback=self.parse_business)

    def parse_business(self, response):
        item = YellowpagesItem()
        item['name'] = response.css('h1::text').get()
        item['phone'] = response.css('p.phone::text').get()
        item['street'] = response.css('h2 > span::text').get()
        item['city_state'] = response.css('div.contact > h2.address::text').get()
        item['tags'] = ','.join([item.get() for item in response.css('p.cats > a::text')])
        item['email'] = response.css('a.email-business').attrib.get('href')
        yield item
Simple typo in your selector: it should be a.business-name.
def parse(self, response):
    for link in response.css('a.business-name'):
        yield response.follow(link.attrib.get('href'), callback=self.parse_business)
If you do not know it already, you can test your selectors and the follow call in the Scrapy shell, which helps you avoid situations like this. Enter in your terminal:
scrapy shell 'https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Conshohocken%2C+PA'
Then test the selectors like:
>> response.css('a.businesss-name')
<< []
>> response.css('a.business-name')
<< [<Selector xpath="descendant-or-self:...>,...]
>> response.follow(response.css('a.business-name::attr(href)').get())
<< <GET https://www.yellowpages.com/conshohocken-pa/mip/tony-joes-pizzeria-10728468?lid=1002028703627>
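In the same shell session you can also open the HTML that Scrapy actually downloaded in your browser with the built-in view() helper, which makes it easy to spot content that is only added later by JavaScript (it typically returns True after opening the page):

>> view(response)
<< True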

Crawled but not scraped

I am trying to scrape the following website with Scrapy (https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date), and I see that the page gets crawled but none of the items are returned. Everything works within the Scrapy shell.
Here is the code I have:
import scrapy

class LeadHomeSpider(scrapy.Spider):
    name = "lead_home"
    start_urls = [
        'https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date',
    ]

    # parse search page
    def parse(self, response):
        # follow property link
        offering = 'buy' if 'sale' in response.css('h1::text').get() else 'rent'
        for prop in response.css('div.search__PropertyCardWrapper-sc-1j5dndx-0.bsqBpI'):
            link = 'https://www.leadhome.co.za' + prop.css('a::attr(href)').get()
            a = prop.css('p.styles__Label-h53xsw-16.bcSkCI::text').getall()
            #prop_type = attempt_get_property_type(a[0]) if len(a) != 0 else None
            area = a[1] if len(a) > 1 else None
            yield scrapy.Request(
                link,
                meta={'item': {
                    'agency': self.name,
                    'url': link,
                    'area': area,
                    'offering': offering,
                    #'property_type': prop_type,
                }},
                callback=self.parse_property,
            )

        # follow to next page
        next_page_number = response.xpath(
            '//a[contains(@class, "styles__PageNumber-zln67a-0 jRCKhp")]/following-sibling::a/text()').get()
        if next_page_number is not None:
            new_page_link = 'https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date&page=' + next_page_number
            next_page = response.urljoin(new_page_link)
            yield scrapy.Request(next_page, callback=self.parse)

    # parse property
    def parse_property(self, response):
        item = response.meta.get('item')
        item['parking'] = response.xpath('//p[contains(text(), "Uncovered Parking:")]/following-sibling::p/text()').get()
        ...
Any idea what might be wrong here? Any suggestions are welcome! Thank you in advance!
You're using randomly generated class values (1j5dndx-0, bsqBpI, etc.) in your CSS expressions; that's why your code doesn't work. Here is the same code, but using XPath's contains() to match the stable part of each class:
def parse(self, response):
    # follow property link
    offering = 'buy' if 'sale' in response.css('h1::text').get() else 'rent'
    # for prop in response.css('div.search__PropertyCardWrapper-sc-1j5dndx-0.bsqBpI'):
    for prop in response.xpath('//div[contains(@class, "search__PropertyCardWrapper-sc-")]'):
        link = prop.xpath('.//a/@href').get()
        # a = prop.css('p.styles__Label-h53xsw-16.bcSkCI::text').getall()
        prop_type = prop.xpath('(.//p[contains(@class, "styles__Label-")])[1]/text()').get()
        # area = a[1] if len(a) > 1 else None
        link = response.urljoin(link)
        yield scrapy.Request(
            url=link,
            meta={'item': {
                'agency': self.name,
                'url': link,
                # 'area': area,
                'offering': offering,
                'property_type': prop_type,
            }},
            callback=self.parse_property,
        )
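You can verify this kind of partial-class match in the Scrapy shell before putting it in the spider; a quick sketch (the exact output depends on the live page):

scrapy shell 'https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date'
>>> len(response.xpath('//div[contains(@class, "search__PropertyCardWrapper-sc-")]'))
>>> response.xpath('(//p[contains(@class, "styles__Label-")])[1]/text()').get()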

INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

I just began to learn Python and Scrapy.
My first project is to crawl information on a website containing web security information. But when I run it from cmd, it says "Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)" and nothing seems to come out. I'd be grateful if someone could solve my problem.
My code:
import scrapy

class SapoSpider(scrapy.Spider):
    name = "imo"
    allowed_domains = ["imovirtual.com"]
    start_urls = ["https://www.imovirtual.com/arrendar/apartamento/lisboa/"]

    def parse(self, response):
        subpage_links = []
        for i in response.css('div.offer-item-details'):
            youritem = {
                'preco': i.css('span.offer-item title::text').extract_first(),
                'autor': i.css('li.offer-item-price::text').extract(),
                'data': i.css('li.offer-item-area::text').extract(),
                'data_2': i.css('li.offer-item-price-perm::text').extract()
            }
            subpage_link = i.css('header[class=offer-item-header] a::attr(href)').extract()
            subpage_links.extend(subpage_link)
            for subpage_link in subpage_links:
                yield scrapy.Request(subpage_link, callback=self.parse_subpage, meta={'item': youritem})

    def parse_subpage(self, response):
        for j in response.css('header[class=offer-item-header] a::attr(href)'):
            youritem = response.meta.get('item')
            youritem['info'] = j.css(' ul.dotted-list, li.h4::text').extract()
            yield youritem
There are two things to correct to make it work:
You need to define the FEED_URI setting with the path where you want to store the result.
You need to use response in parse_subpage, because the logic is the following: Scrapy downloads https://www.imovirtual.com/arrendar/apartamento/lisboa/ and gives the response to parse; you extract the ad URLs and ask Scrapy to download each page and pass the downloaded pages to parse_subpage. So response in parse_subpage corresponds to, for example, https://www.imovirtual.com/anuncio/t0-totalmente-remodelado-localizacao-excelente-IDGBAY.html#913474cdaa
This should work:
import scrapy

class SapoSpider(scrapy.Spider):
    name = "imo"
    allowed_domains = ["imovirtual.com"]
    start_urls = ["https://www.imovirtual.com/arrendar/apartamento/lisboa/"]
    custom_settings = {
        'FEED_URI': './output.json'
    }

    def parse(self, response):
        subpage_links = []
        for i in response.css('div.offer-item-details'):
            youritem = {
                'preco': i.css('span.offer-item title::text').extract_first(),
                'autor': i.css('li.offer-item-price::text').extract(),
                'data': i.css('li.offer-item-area::text').extract(),
                'data_2': i.css('li.offer-item-price-perm::text').extract()
            }
            subpage_link = i.css('header[class=offer-item-header] a::attr(href)').extract()
            subpage_links.extend(subpage_link)
            for subpage_link in subpage_links:
                yield scrapy.Request(subpage_link, callback=self.parse_subpage, meta={'item': youritem})

    def parse_subpage(self, response):
        youritem = response.meta.get('item')
        youritem['info'] = response.css(' ul.dotted-list, li.h4::text').extract()
        yield youritem
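As an alternative to hard-coding FEED_URI, you can pass the output file on the command line, and newer Scrapy versions (2.1+) also support the FEEDS setting, which supersedes FEED_URI; a sketch of both options:

# write items to a file from the command line instead of custom_settings
scrapy crawl imo -o output.json

# or, inside the spider, the newer equivalent of FEED_URI:
custom_settings = {
    'FEEDS': {'output.json': {'format': 'json'}},
}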

How to scrape multiple pages from a website?

(Very) new to Python and programming in general.
I've been trying to scrape data from more pages/sections of the same website with Scrapy.
My code works, but it's unreadable and not practical:
import scrapy

class SomeSpider(scrapy.Spider):
    name = 'some'
    allowed_domains = ['https://example.com']
    start_urls = [
        'https://example.com/Python/?k=books&p=1',
        'https://example.com/Python/?k=books&p=2',
        'https://example.com/Python/?k=books&p=3',
        'https://example.com/Python/?k=tutorials&p=1',
        'https://example.com/Python/?k=tutorials&p=2',
        'https://example.com/Python/?k=tutorials&p=3',
    ]

    def parse(self, response):
        response.selector.remove_namespaces()
        info1 = response.css("scrapedinfo1").extract()
        info2 = response.css("scrapedinfo2").extract()
        for item in zip(info1, info2):
            scraped_info = {
                'scrapedinfo1': item[0],
                'scrapedinfo2': item[1]}
            yield scraped_info
How can I improve this?
I'd like to search within a certain number of categories and pages. I need something like
categories = [books, tutorials, a, b, c, d, e, f]
in a range(1,3)
so that Scrapy would be able to do its job through all categories and pages, while being easy to edit and adapt to other websites.
Any ideas are welcome.
What I have tried:
categories = ["books", "tutorials"]
base = "https://example.com/Python/?k={category}&p={index}"

def url_generator():
    for category, index in itertools.product(categories, range(1, 4)):
        yield base.format(category=category, index=index)
But Scrapy returns
[scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
Solved, thanks to start_requests() and yield scrapy.Request().
Here's the code:
import scrapy
import itertools

class SomeSpider(scrapy.Spider):
    name = 'somespider'
    allowed_domains = ['example.com']

    def start_requests(self):
        categories = ["books", "tutorials"]
        base = "https://example.com/Python/?k={category}&p={index}"
        for category, index in itertools.product(categories, range(1, 4)):
            yield scrapy.Request(base.format(category=category, index=index))

    def parse(self, response):
        response.selector.remove_namespaces()
        info1 = response.css("scrapedinfo1").extract()
        info2 = response.css("scrapedinfo2").extract()
        for item in zip(info1, info2):
            scraped_info = {
                'scrapedinfo1': item[0],
                'scrapedinfo2': item[1],
            }
            yield scraped_info
You can use the start_requests() method to generate URLs at start, using yield Request(url).
BTW: later, in parse(), you can also yield Request(url) to add new URLs, for example to follow pagination (a sketch follows).
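A minimal sketch of that idea (the a.next selector is purely illustrative, not taken from any specific site):

def parse(self, response):
    # ... yield items extracted from this page here ...
    # follow a "next page" link found on the current page
    next_href = response.css('a.next::attr(href)').get()
    if next_href:
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)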
I use the portal toscrape.com, which was created for testing spiders.
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['quotes.toscrape.com']
    #start_urls = []

    tags = ['love', 'inspirational', 'life', 'humor', 'books', 'reading']
    pages = 3
    url_template = 'http://quotes.toscrape.com/tag/{}/page/{}'

    def start_requests(self):
        for tag in self.tags:
            for page in range(1, self.pages + 1):
                url = self.url_template.format(tag, page)
                yield scrapy.Request(url)

    def parse(self, response):
        # test if the method was executed
        print('url:', response.url)

# --- run it without a project ---
from scrapy.crawler import CrawlerProcess

#c = CrawlerProcess({
#    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
#    'FEED_FORMAT': 'csv',
#    'FEED_URI': 'output.csv',
#})
c = CrawlerProcess()
c.crawl(MySpider)
c.start()
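Because the CrawlerProcess call is included, the whole snippet can be run as a plain script rather than through scrapy crawl; assuming it is saved as myspider.py (a hypothetical file name):

python myspider.py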
