INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) - python

I just began to learn Python and Scrapy.
My first project is to crawl information from a website containing web security information. But when I run it from the command line, it only reports "Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)" and nothing comes out. I'd be grateful if someone could help me solve this.
My code:
import scrapy

class SapoSpider(scrapy.Spider):
    name = "imo"
    allowed_domains = ["imovirtual.com"]
    start_urls = ["https://www.imovirtual.com/arrendar/apartamento/lisboa/"]

    def parse(self, response):
        subpage_links = []
        for i in response.css('div.offer-item-details'):
            youritem = {
                'preco': i.css('span.offer-item title::text').extract_first(),
                'autor': i.css('li.offer-item-price::text').extract(),
                'data': i.css('li.offer-item-area::text').extract(),
                'data_2': i.css('li.offer-item-price-perm::text').extract()
            }
            subpage_link = i.css('header[class=offer-item-header] a::attr(href)').extract()
            subpage_links.extend(subpage_link)
        for subpage_link in subpage_links:
            yield scrapy.Request(subpage_link, callback=self.parse_subpage, meta={'item': youritem})

    def parse_subpage(self, response):
        for j in response.css('header[class=offer-item-header] a::attr(href)'):
            youritem = response.meta.get('item')
            youritem['info'] = j.css(' ul.dotted-list, li.h4::text').extract()
            yield youritem

There are two things to correct to make it work:
You need to define the FEED_URI setting with the path where you want to store the result.
You need to use response in parse_subpage, because the logic is the following: Scrapy downloads "https://www.imovirtual.com/arrendar/apartamento/lisboa/" and gives the response to parse; you extract the ad URLs and ask Scrapy to download each of those pages and pass them to parse_subpage. So response in parse_subpage corresponds, for example, to https://www.imovirtual.com/anuncio/t0-totalmente-remodelado-localizacao-excelente-IDGBAY.html#913474cdaa
This should work:
import scrapy

class SapoSpider(scrapy.Spider):
    name = "imo"
    allowed_domains = ["imovirtual.com"]
    start_urls = ["https://www.imovirtual.com/arrendar/apartamento/lisboa/"]
    custom_settings = {
        'FEED_URI': './output.json'
    }

    def parse(self, response):
        subpage_links = []
        for i in response.css('div.offer-item-details'):
            youritem = {
                'preco': i.css('span.offer-item title::text').extract_first(),
                'autor': i.css('li.offer-item-price::text').extract(),
                'data': i.css('li.offer-item-area::text').extract(),
                'data_2': i.css('li.offer-item-price-perm::text').extract()
            }
            subpage_link = i.css('header[class=offer-item-header] a::attr(href)').extract()
            subpage_links.extend(subpage_link)
        for subpage_link in subpage_links:
            yield scrapy.Request(subpage_link, callback=self.parse_subpage, meta={'item': youritem})

    def parse_subpage(self, response):
        youritem = response.meta.get('item')
        youritem['info'] = response.css(' ul.dotted-list, li.h4::text').extract()
        yield youritem
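A side note on the FEED_URI setting used above: on newer Scrapy releases (2.1 and later, which is an assumption about the version in use) FEED_URI is deprecated in favour of the FEEDS setting. A minimal equivalent sketch:

    custom_settings = {
        # equivalent of FEED_URI on Scrapy >= 2.1: one JSON output file
        'FEEDS': {
            './output.json': {'format': 'json'},
        },
    }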

Related

Web crawler returns zero pages

I'm trying to create a spider from the results of this page
When I run the spider with runspider, it returns zero crawled pages.
There are further errors that I do not understand. I'm a beginner and this project is new to me.
My code is:
import scrapy

class SusSpider(scrapy.Spider):
    name = 'susManagement'
    allowed_domains = ['in.gov.br/']
    start_urls = ['https://www.in.gov.br/consulta/-/buscar/dou?q=*&s=do1&s=doe&exactDate=personalizado&sortType=0&delta=20&publishFrom=01%2F10%2F2021&publishTo=31%2F12%2F2021&orgPrin=Minist%C3%A9rio+da+Sa%C3%BAde']

    def parse(self, response):
        gates = response.xpath("//div[@class='resultado']//h5[@class='title-marker']//a")
        for gate in gates:
            gate_number = gate.xpath(".//text()").get()
            link_gate = gate.xpath(".//@href").get()
            yield response.follow(url=link_gate, callback=self.parse_text,
                                  meta={'gate_name': gate_number})
        next_page = response.xpath("//div//ul/li[@class='page-item active']/button")
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_text(self, response):
        portaria = response.request.meta['gate_name']
        num_portaria = response.xpath("*//section//div//p[@class='identifica']/text()").re('.*')
        texto = response.xpath("//section//div//p[@class='texto-dou']/text()").re('.*')
        ementa = response.xpath("//article//div//p[@class='ementa']/text()").re('.*')
        rest_texto = texto - ementa - num_portaria
        yield {
            'port_name': portaria,
            'numero_port': num_portaria,
            'classified': ementa,
            'texto_integral': rest_texto
        }
I have tried to change the sequence of my parse functions, and clicking on each file after it has finished.
At the moment I want to create a CSV file with the documents from the output, separated into columns.
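Two notes on getting that far: rest_texto = texto - ementa - num_portaria subtracts Python lists, which raises a TypeError, so that line needs list filtering instead; and for the CSV output, a minimal sketch (assuming a recent Scrapy with the FEEDS setting; the filename is illustrative) is to declare the feed on the spider, so each yielded dict becomes one CSV row with one column per key:

    custom_settings = {
        # illustrative output path; each yielded dict becomes a row in the CSV
        'FEEDS': {
            'portarias.csv': {'format': 'csv'},
        },
    }

Running the spider with scrapy crawl susManagement -o portarias.csv achieves the same from the command line.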

Why my Scrapy Response.follow is not following the links and returns empty?

Below you can see my spider (in this case it is just a test). As a test, I first ran parse_remedios as the main parse method and it returned results. But when I use it as the second function (parse_remedios, as in the code below) the results are empty. I know it has something to do with response.follow not running well. Any ideas?
import scrapy

class RemediosSpider(scrapy.Spider):
    name = 'remedios'
    allowed_domains = ['www.drogariaspachecopacheco.com.br']
    start_urls = ['https://www.drogariaspacheco.com.br/clorana%2025mg']

    def parse(self, response):
        print(response.url)
        for link in response.css('.collection-link::attr(href)'):
            yield response.follow(link.get(), callback=self.parse_remedios)

    def parse_remedios(self, response):
        resultado = response.css('.container-fluid')
        yield {
            'nome': resultado.css('.productName::text').get(),
            'preco': resultado.css('.skuBestPrice::text').get(),
            'link': response.url,
            'sku': resultado.css('.skuReference::text').get()
        }
The problem is with your allowed_domains. Scrapy is filtering out all of the links after the start_urls because they do not match any of the domains in your allowed_domains list. I have corrected it in the example below.
You can see the filtering in the output logs:
2022-09-15 21:38:24 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.drogariaspacheco.com.br': <GET https://www.drogariaspacheco.com.br/clorana-25mg-sanofi-aventis-30-comprimidos/p>
import scrapy

class RemediosSpider(scrapy.Spider):
    name = 'remedios'
    allowed_domains = ['drogariaspacheco.com.br']
    start_urls = ['https://www.drogariaspacheco.com.br/clorana%2025mg']

    def parse(self, response):
        for link in response.css('.collection-link::attr(href)'):
            yield response.follow(link.get(), callback=self.parse_remedios)

    def parse_remedios(self, response):
        resultado = response.css('.container-fluid')
        yield {
            'nome': resultado.css('.productName::text').get(),
            'preco': resultado.css('.skuBestPrice::text').get(),
            'link': response.url,
            'sku': resultado.css('.skuReference::text').get()
        }
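Since the CSS selector already targets the href attribute, a slightly more compact variant of parse is possible on Scrapy 2.0+ (the version is an assumption) with response.follow_all; a minimal sketch:

    def parse(self, response):
        # follow every collected href with the same callback as above
        yield from response.follow_all(
            response.css('.collection-link::attr(href)'),
            callback=self.parse_remedios,
        )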

Scrapy-crawled-200 Referer-None

I'm trying to learn how to use Scrapy and Python, but I'm not an expert at all...
I have an empty file after crawling this page:
so.news.cn
and I don't understand why...
Here is my code:
import scrapy

class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/']

    def parse(self, response):
        #titles = response.css('#newsCon > div.newsList > div.news > h2 > a::text').extract()
        #date = response.css('#newsCon > div.newsList > div.news> div > p.newstime > span::text').extract()
        titles = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/h2/a/text()").extract()
        date = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/div[@class='easynews']/p[@class='newstime']/span/text()").extract()
        for item in zip(titles, date):
            scraped_info = {
                "title": item[0],
                "date": item[1],
            }
            yield scraped_info
        nextPg = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='pagination']/a[@class='next']/@href").extract()
        if nextPg is not None:
            print(nextPg)
This is the message in the console:
2020-05-11 00:09:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/> (referer: None)
[]
You always need to check the page's source code (Ctrl+U) in your browser. The content you see in your browser may be loaded by an XHR JavaScript call. Here is code that works for me (I found the correct start URL using the Chrome Developer Console):
import scrapy
import json
import re

class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    # allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/getNews?keyWordAll=&keyWordOne=%25E6%2596%25B0%25E5%2586%25A0%2B%25E8%2582%25BA%25E7%2582%258E%2B%25E6%25AD%25A6%25E6%25B1%2589%2B%25E7%2597%2585%25E6%25AF%2592&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn&keyword=%E6%96%B0%E5%86%A0&curPage=1']

    def parse(self, response):
        data = json.loads(response.body)
        for item in data["content"]["results"]:
            scraped_info = {
                "title": item['title'],
                "date": item['pubtime'],
            }
            yield scraped_info
        current_page = data['content']['curPage']
        total_pages = data['content']['pageCount']
        if current_page < total_pages:
            next_page = re.sub(r'curPage=\d+', f"curPage={current_page + 1}", response.url)
            yield scrapy.Request(
                url=next_page,
                callback=self.parse,
            )
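The re.sub over the query string works here because curPage appears exactly once in the URL. If you would rather not rely on that, a hedged alternative (a standalone helper, not part of the original answer) rebuilds the URL with urllib.parse:

from urllib.parse import urlencode, urlparse, parse_qs, urlunparse

def next_page_url(current_url, next_page):
    # rebuild the URL with curPage replaced instead of patching it with re.sub
    parts = urlparse(current_url)
    query = parse_qs(parts.query, keep_blank_values=True)
    query['curPage'] = [str(next_page)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

Inside parse it would be used as next_page = next_page_url(response.url, current_page + 1).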

Problems getting next page when scraping with scrapy

I have a Scrapy spider which doesn't crawl the pagination links, and I'm stuck.
The source of the page is:
https://www.levenhuk.bg/katalog/teleskopi/?page=1
My code is:
import scrapy

class TelescopesSpider(scrapy.Spider):
    name = 'telescopes'
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page=1']
    download_delay = 3

    def parse(self, response):
        for product in response.xpath('//div[@class="catalog-item"]'):
            yield {
                # 'name': product.xpath('.//span[@itemprop="name" and contains(text(), "Levenhuk")]/text()').get(),
                'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
                # 'price': product.xpath('.//div[@class="price"]/span/text()').get(),
                'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
                'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
            }
        next_page_url = response.xpath('//*[@class="pagesCount"][1]//@href').get()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
I feel like the problem is simply that you are not specifying a callback in your pagination request. Specify your parse function as the callback and that should work. Please comment if it still doesn't work.
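A minimal sketch of that fix, keeping the question's XPath as it is:

# at the end of parse():
next_page_url = response.xpath('//*[@class="pagesCount"][1]//@href').get()
if next_page_url is not None:
    yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)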
Edit:
In this case I feel like your logic needs an overhaul. I suggest separating the pagination and item extraction logic. Try the following:
def parse(self, response):
    # extract the items on this page, and yield what extract_item produces
    yield from self.extract_item(response)
    next_page_urls = response.xpath('//*[@class="pagesCount"][1]//@href').getall()
    if next_page_urls:
        for url in next_page_urls:
            yield scrapy.Request(response.urljoin(url), callback=self.extract_item)

def extract_item(self, response):
    for product in response.xpath('//div[@class="catalog-item"]'):
        yield {
            # 'name': product.xpath('.//span[@itemprop="name" and contains(text(), "Levenhuk")]/text()').get(),
            'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
            # 'price': product.xpath('.//div[@class="price"]/span/text()').get(),
            'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
            'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
        }
So now the parse function handles pagination and the extract_item function extracts items for every page.
Modify allowed_domains as well, as specified by Pasindu.
Change this:
allowed_domains = ['https://www.levenhuk.bg/']
to:
allowed_domains = ['levenhuk.bg']
You also need to change:
next_page_url = response.xpath('//*[@class="pagesCount"][1]//@href').get()
This only works for the first page; for pages 2, 3, 4, ..., it will extract a link pointing back to the first page.
And also add a callback as mentioned by UzairAhmed.
This is a little tricky, since standard practice is usually just to check whether there is a next-page button, in a loop, until there isn't one.
Here's an example: since there is no next-page button, we can work out the total page count instead. There will be a duplicate request to page 1 with this method, though, so it's not the most ideal situation.
import scrapy

class TelescopesSpider(scrapy.Spider):
    name = 'telescopes'
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page=1']
    download_delay = 3

    def parse(self, response):
        # the last pagination link holds the total number of pages
        total_pages = int(response.css('.pagesCount a::text')[-1].get())
        for i in range(1, total_pages + 1):
            url = 'https://www.levenhuk.bg/katalog/teleskopi/?page={}'.format(i)
            yield scrapy.Request(url, callback=self.parse_item, dont_filter=True)

    def parse_item(self, response):
        for product in response.xpath('//div[@class="catalog-item"]'):
            yield {
                'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
                'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
                'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
            }
Another method of doing this would be to just look at how many pages there are and override your start_requests method as follows:
class TelescopesSpider(scrapy.Spider):
    name = 'telescopes'
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page={}']
    download_delay = 3

    def start_requests(self):
        for i in range(1, 14):
            yield scrapy.Request(self.start_urls[0].format(str(i)), callback=self.parse)

How to scrape multiple pages from a website?

(Very) new to Python and programming in general.
I've been trying to scrape data from several pages/sections of the same website with Scrapy.
My code works, but it's unreadable and not practical.
import scrapy

class SomeSpider(scrapy.Spider):
    name = 'some'
    allowed_domains = ['https://example.com']
    start_urls = [
        'https://example.com/Python/?k=books&p=1',
        'https://example.com/Python/?k=books&p=2',
        'https://example.com/Python/?k=books&p=3',
        'https://example.com/Python/?k=tutorials&p=1',
        'https://example.com/Python/?k=tutorials&p=2',
        'https://example.com/Python/?k=tutorials&p=3',
    ]

    def parse(self, response):
        response.selector.remove_namespaces()
        info1 = response.css("scrapedinfo1").extract()
        info2 = response.css("scrapedinfo2").extract()
        for item in zip(info1, info2):
            scraped_info = {
                'scrapedinfo1': item[0],
                'scrapedinfo2': item[1]}
            yield scraped_info
How can I improve this?
I'd like to search within a certain number of categories and pages.
I need something like
categories = [books, tutorials, a, b, c, d, e, f]
in a range(1,3)
So that Scrapy would be able to do its job through all categories and pages, while being easy to edit and adapt to other websites
Any ideas are welcome
What I have tried:
categories = ["books", "tutorials"]
base = "https://example.com/Python/?k={category}&p={index}"
def url_generator():
for category, index in itertools.product(categories, range(1, 4)):
yield base.format(category=category, index=index)
But Scrapy returns
[scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
Solved thanks to start_requests() and yield scrapy.Request()
Here's the code
import scrapy
import itertools

class SomeSpider(scrapy.Spider):
    name = 'somespider'
    allowed_domains = ['example.com']

    def start_requests(self):
        categories = ["books", "tutorials"]
        base = "https://example.com/Python/?k={category}&p={index}"
        for category, index in itertools.product(categories, range(1, 4)):
            yield scrapy.Request(base.format(category=category, index=index))

    def parse(self, response):
        response.selector.remove_namespaces()
        info1 = response.css("scrapedinfo1").extract()
        info2 = response.css("scrapedinfo2").extract()
        for item in zip(info1, info2):
            scraped_info = {
                'scrapedinfo1': item[0],
                'scrapedinfo2': item[1],
            }
            yield scraped_info
You can use the start_requests() method to generate URLs at startup, using yield Request(url).
BTW: later, in parse(), you can also use yield Request(url) to add new URLs (see the sketch after the code below).
I use the portal toscrape.com, which was created for testing spiders.
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['toscrape.com']
    #start_urls = []

    tags = ['love', 'inspirational', 'life', 'humor', 'books', 'reading']
    pages = 3
    url_template = 'http://quotes.toscrape.com/tag/{}/page/{}'

    def start_requests(self):
        for tag in self.tags:
            for page in range(1, self.pages + 1):
                url = self.url_template.format(tag, page)
                yield scrapy.Request(url)

    def parse(self, response):
        # test if method was executed
        print('url:', response.url)

# --- run it without project ---

from scrapy.crawler import CrawlerProcess

#c = CrawlerProcess({
#    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
#    'FEED_FORMAT': 'csv',
#    'FEED_URI': 'output.csv',
#})

c = CrawlerProcess()
c.crawl(MySpider)
c.start()
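As a follow-up to the BTW above, here is a hedged sketch of a parse() that also yields new Requests for pagination; the selectors (div.quote, span.text, li.next) are assumptions based on the usual quotes.toscrape.com markup:

    def parse(self, response):
        # extract one field per quote on the page
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}
        # queue the next page if the site exposes a "Next" link
        next_href = response.css('li.next a::attr(href)').get()
        if next_href:
            yield response.follow(next_href, callback=self.parse)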
