No pages getting crawled - Scrapy - Python

The following Scrapy spider was developed to crawl pages from the Americanas website:
# -*- coding: utf-8 -*-
import scrapy
import urllib
import re
import webscrape.items
import time
from urlparse import urljoin
from HTMLParser import HTMLParser

class AmericanasSpider(scrapy.Spider):
    name = "americanas"
    start_urls = ('http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/',)
    source = webscrape.items.ImportSource("Americanas")

    def parse(self, response):
        ind = 0
        self.source.submit()
        b = []
        for c in response.xpath('//div[@class="item-menu"]/ul'):
            c1 = re.sub('[\t\n]', '', c.xpath('//span[@class="menu-heading"]/text()').extract()[ind])
            if (c1):
                x = webscrape.items.Category(c1)
                x.submit()
                for b in c.xpath('li'):
                    b1 = webscrape.items.Category(b.xpath('a/text()').extract()[0])
                    if (b1):
                        b1.setParent(x.getID())
                        b1.submit()
                        link = b.xpath('@href').extract()
                        urla = urljoin(response.url, link)
                        request = scrapy.Request(urla, callback=self.parse_category)
                        request.meta['idCategory'] = b1.getID()
                        yield request
                        for a in b.xpath('ul/li/a/text()'):
                            a1 = webscrape.items.Category(a.extract())
                            a1.setParent(b1.getID())
                            a1.submit()
                            link = a.xpath('@href').extract()
                            urla = urljoin(response.url, link)
                            request = scrapy.Request(urla, callback=self.parse_category)
                            request.meta['idCategory'] = a1.getID()
                            yield request
            ind = ind + 1

    def parse_category(self, response):
        # products on the page
        items = response.xpath('//div[@class="paginado"]//article[@class="single-product vitrine230 "]')
        for item in items:
            url = item.xpath('.//div[@itemprop="item"]/form/div[@class="productInfo"]/div]/a[@class="prodTitle"]/@href').extract()
            urla = urljoin(response.url, link)
            request = scrapy.Request(urla, callback=self.parse_product)
            request.meta['idCategory'] = response.meta['idCategory']
            yield request
        # next page (if it exists)
        nextpage = response.xpath('//div[@class="pagination"]/ul/li/a[@class="pure-button next"]/@href').extract()
        if (nextpage):
            link = nextpage[0]
            urlb = urljoin(response.url, link)
            self.log('Next Page: {0}'.format(nextpage))
            request = scrapy.Request(urlb, callback=self.parse_category)
            request.meta['idCategory'] = response.meta['idCategory']
            yield request

    def parse_product(self, response):
        print response.url
        title = response.xpath('//title/text()').extract()
        self.log(u'Título: {0}'.format(title))
But I get the following output:
PS C:\Users\Natalia Oliveira\Desktop\Be Happy\behappy\import\webscrape> scrapy crawl americanas
2016-10-06 17:28:04 [scrapy] INFO: Scrapy 1.1.2 started (bot: webscrape)
2016-10-06 17:28:04 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'webscrape.spiders', 'REDIRECT_ENABLED': False, 'SPIDER_MODULES': ['webscrap.spiders'], 'BOT_NAME': 'webscrape'}
2016-10-06 17:28:04 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2016-10-06 17:28:05 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-10-06 17:28:05 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-10-06 17:28:05 [scrapy] INFO: Enabled item pipelines:
[]
2016-10-06 17:28:05 [scrapy] INFO: Spider opened
2016-10-06 17:28:05 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-10-06 17:28:05 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-10-06 17:28:05 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/> (referer: None)
2016-10-06 17:28:07 [scrapy] DEBUG: Filtered duplicate request: <GET http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2016-10-06 17:28:07 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/> (referer: http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/)
2016-10-06 17:28:22 [scrapy] INFO: Closing spider (finished)
2016-10-06 17:28:22 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 931,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 80585,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'dupefilter/filtered': 60,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 10, 6, 20, 28, 22, 257000),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2016, 10, 6, 20, 28, 5, 346000)}
2016-10-06 17:28:22 [scrapy] INFO: Spider closed (finished)
I really don't know what is wrong here, because I'm a beginner with Scrapy. Where am I going wrong?
The parse method runs as expected, so I think the error must be in the parse_category or parse_product methods.
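One quick way to narrow this down is to run the category-page selectors in scrapy shell and check whether they actually match anything, for example (XPaths copied from parse_category above):
# In a terminal:
#   scrapy shell "http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/"
# Then, at the shell prompt, inspect what each selector returns:
response.xpath('//div[@class="paginado"]//article[@class="single-product vitrine230 "]')
response.xpath('//div[@class="pagination"]/ul/li/a[@class="pure-button next"]/@href').extract()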

Your XPath is not correct, and there is only one item-menu per page. I have removed the items logic, as I don't know what those are. This will get you all the links from the item-menu ul; you can add back in whatever logic you like:
def parse(self, response):
    for url in response.xpath('//div[@class="item-menu"]/ul/li[@class="item-linha"]/a/@href').extract():
        if not url.startswith("http"):
            url = response.urljoin(url)
        request = scrapy.Request(url, callback=self.parse_category)
        request.meta['idCategory'] = url  # add whatever here
        yield request
Your next method is also overcomplicated; you don't need to worry about anything but the anchor tags with the prodTitle class:
def parse_category(self, response):
    # products on the page
    urls = response.css('a.prodTitle::attr(href)').extract()
    for url in urls:
        request = scrapy.Request(url, callback=self.parse_product)
        request.meta['idCategory'] = response.meta['idCategory']
        yield request

    # you want to check for the anchor with "Próxima" text
    nextpage = response.xpath(u'//ul[@class="pure-paginator acrN"]/li/a[contains(.,"Próxima")]/@href').extract_first()
    if nextpage:
        self.log(u'Next Page: {0}'.format(nextpage))
        request = scrapy.Request(nextpage, callback=self.parse_category)
        request.meta['idCategory'] = response.meta['idCategory']
        yield request

def parse_product(self, response):
    print response.url
    title = response.xpath('//title/text()').extract_first()
    self.log(u'Título: {0}'.format(title))
If you run it now you will see lots of output like:
2016-10-06 23:25:15 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/314061/alimentos-e-bebidas/biscoitos?ofertas.offset=30
2016-10-06 23:25:15 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/342151/alimentos-e-bebidas/azeite-e-vinagre?ofertas.offset=30
2016-10-06 23:25:15 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/342129/alimentos-e-bebidas/barra-de-cereais?ofertas.offset=30
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/15815078/nan-comfor-1-formula-infantil-nestle-lata-800g> (referer: http://www.americanas.com.br/linha/314080/alimentos-e-bebidas/alimentacao-infantil)
http://www.americanas.com.br/produto/15815078/nan-comfor-1-formula-infantil-nestle-lata-800g
2016-10-06 23:25:16 [americanas] DEBUG: Título: Nan Comfor 1 Fórmula Infantil Nestlé Lata 800g - Americanas.com
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/316829/eletrodomesticos/adega-de-vinho> (referer: http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/)
2016-10-06 23:25:16 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/316829/eletrodomesticos/adega-de-vinho?ofertas.offset=30
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/7170286/goiabada-135g-diet-house> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/9955598/adocante-em-sache-fit-caixa-com-30-unidades-de-2-5g-uniao> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/285368/utilidades-domesticas/vinho> (referer: http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/)
http://www.americanas.com.br/produto/7170286/goiabada-135g-diet-house
2016-10-06 23:25:16 [americanas] DEBUG: Título: Goiabada 135g - Diet House - Americanas.com
http://www.americanas.com.br/produto/9955598/adocante-em-sache-fit-caixa-com-30-unidades-de-2-5g-uniao
2016-10-06 23:25:16 [americanas] DEBUG: Título: Adoçante Em Sache Fit Caixa Com 30 Unidades De 2,5g União - Americanas.com
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/121047374/barra-de-chocolate-ao-leite-lacta-150g-1-unidade> (referer: http://www.americanas.com.br/linha/314045/alimentos-e-bebidas/bomboniere)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/314080/alimentos-e-bebidas/alimentacao-infantil?ofertas.offset=30> (referer: http://www.americanas.com.br/linha/314080/alimentos-e-bebidas/alimentacao-infantil)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce?ofertas.offset=30> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/9800047/acucar-refinado-caixa-com-400-envelopes-x-5g-uniao-premium> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
http://www.americanas.com.br/produto/121047374/barra-de-chocolate-ao-leite-lacta-150g-1-unidade
2016-10-06 23:25:16 [americanas] DEBUG: Título: Barra de Chocolate Ao leite Lacta 150g - 1 unidade - Americanas.com

Related

Web scraping pagination with scrapy

I'm trying web scraping with Scrapy, but I get a "duplicates" warning and can't jump to the next page.
How can I scrape all pages with pagination?
Example site: teknosa.com
Scraping URL: https://www.teknosa.com/bilgisayar-tablet-c-116
Pagination structure: ?s=%3Arelevance&page=0 (1, 2, 3, 4, 5, and more...)
My pagination code:
next_page = soup.find('button', {'title': 'Daha Fazla Ürün Gör'})['data-gotohref']
if next_page is not None:
    next_page = response.urljoin(next_page)
    yield scrapy.Request(next_page, callback=self.parse)
You can build the pagination into start_urls and increase or decrease the range of page numbers.
import scrapy
from scrapy.crawler import CrawlerProcess

class CarsSpider(scrapy.Spider):
    name = 'car'
    start_urls = ['https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=' + str(x) for x in range(1, 11)]

    def parse(self, response):
        print(response.url)

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(CarsSpider)  # pass the spider class to crawl
    process.start()
Output:
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=1
2022-05-01 08:55:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=2> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=2
2022-05-01 08:55:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=5> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=5
2022-05-01 08:55:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=6> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=6
2022-05-01 08:55:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=7> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=7
2022-05-01 08:55:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=3> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=3
2022-05-01 08:55:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=4> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=4
2022-05-01 08:55:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=8> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=8
2022-05-01 08:55:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=9> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=9
2022-05-01 08:55:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=10> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=10
Multiple URLs, pagination using a for loop:
import scrapy

class CarsSpider(scrapy.Spider):
    name = 'car'

    def start_requests(self):
        urls = ['url_1', 'url_2', 'url_3', ...]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        ...

CrawlSpider callback not working, next page is retrieved once

I am trying to perform a horizontal crawl -- starting from the first page and moving on until reaching the last page. The code is as follows:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from scrapy_project.items import metacriticItem
import datetime

# xpaths
main_xpath = '//div[@class = "title_bump"]//td[@class = "clamp-summary-wrap"]'
mt_url_xpath = './a/@href'

class MovieUrlSpider(CrawlSpider):
    name = 'movie_url'
    allowed_domains = ['web']
    start_urls = (
        'https://www.metacritic.com/browse/movies/score/metascore/all',
    )

    # rules for horizontal crawling
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[@rel="next"]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # list of items we want
        main = response.xpath(main_xpath)
        for i in main:
            # create the loader using the response
            l = ItemLoader(item=metacriticItem(), selector=i)
            # key
            l.add_xpath('mt_url', mt_url_xpath)
            # housekeeping fields
            l.add_value('url', response.url)
            l.add_value('spider', self.name)
            l.add_value('date', datetime.datetime.now().strftime('%d/%m/%Y'))
            yield l.load_item()
This script did not parse any items and returned the following messages:
2021-05-12 02:22:24 [scrapy.core.engine] INFO: Spider opened
2021-05-12 02:22:24 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-05-12 02:22:24 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2021-05-12 02:22:24 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/robots.txt> (referer: None)
2021-05-12 02:22:24 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.metacritic.com/browse/movies/score/metascore/all> (referer: None)
2021-05-12 02:22:24 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.metacritic.com': <GET https://www.metacritic.com/browse/movies/score/metascore/all?page=1>
2021-05-12 02:22:24 [scrapy.core.engine] INFO: Closing spider (finished)
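Judging by the "Filtered offsite request to 'www.metacritic.com'" line, a likely culprit is allowed_domains = ['web']: the OffsiteMiddleware drops every paginated URL the LinkExtractor finds because its domain does not match. A minimal sketch of that one change, with the rest of the spider as above:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class MovieUrlSpider(CrawlSpider):
    name = 'movie_url'
    # allowed_domains must match the crawled site, otherwise OffsiteMiddleware
    # filters the page=1, page=2, ... requests seen in the log above
    allowed_domains = ['metacritic.com']
    start_urls = ('https://www.metacritic.com/browse/movies/score/metascore/all',)
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//a[@rel="next"]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        self.logger.info('Parsed %s', response.url)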

Scrapy not pulling closing prices from Yahoo! Finance

I'm trying to pull closing prices and percentage changes for three tickers from Yahoo! Finance using Scrapy. However, I get no data even though I've confirmed my XPaths work and get me to the right place on the actual page, using the console in Chrome. Could someone let me know what might be happening here?
items.py:
from scrapy.item import Item, Field

class InvestmentItem(Item):
    ticker = Field()
    closing_px = Field()
    closing_pct = Field()

investment_spider.py:

from scrapy import Spider
from scrapy.selector import Selector
from investment.items import InvestmentItem

class InvestmentSpider(Spider):
    name = "investment"
    allowed_domains = ["finance.yahoo.com"]
    start_urls = ["https://finance.yahoo.com/quote/SPY?p=SPY", "https://finance.yahoo.com/quote/DIA?p=DIA", "https://finance.yahoo.com/quote/QQQ?p=QQQ"]

    def parse(self, response):
        results = Selector(response).xpath('//div[@class="D(ib) Mend(20px)"]')
        for result in results:
            item = InvestmentItem()
            item['closing_px'] = result.xpath('//span[@class="Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)"]/text()').extract()[0]
            item['closing_pct'] = result.xpath('//span[@class="Trsdu(0.3s) Fw(500) Pstart(10px) Fz(24px) C($dataRed)"]/text()').extract()[0]
            yield item
output from console:
2020-03-22 23:42:26 [scrapy.utils.log] INFO: Scrapy 2.0.0 started (bot: investment)
2020-03-22 23:42:26 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.8.2 (v3.8.2:7b3ab5921f, Feb 24 2020, 17:52:18) - [Clang 6.0 (clang-600.0.57)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d 10 Sep 2019), cryptography 2.8, Platform macOS-10.15.3-x86_64-i386-64bit
2020-03-22 23:42:26 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-03-22 23:42:26 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'investment',
'NEWSPIDER_MODULE': 'investment.spiders',
'ROBOTSTXT_OBEY': True,
'SPIDER_MODULES': ['investment.spiders']}
2020-03-22 23:42:26 [scrapy.extensions.telnet] INFO: Telnet Password: 4d82e058cd5967c1
2020-03-22 23:42:26 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2020-03-22 23:42:26 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2020-03-22 23:42:26 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2020-03-22 23:42:26 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2020-03-22 23:42:26 [scrapy.core.engine] INFO: Spider opened
2020-03-22 23:42:26 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-03-22 23:42:26 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-03-22 23:42:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://finance.yahoo.com/robots.txt> (referer: None)
2020-03-22 23:42:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://finance.yahoo.com/quote/SPY?p=SPY> (referer: None)
2020-03-22 23:42:27 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://finance.yahoo.com/quote/QQQ?p=QQQ> (referer: None)
2020-03-22 23:42:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://finance.yahoo.com/quote/DIA?p=DIA> (referer: None)
2020-03-22 23:42:29 [scrapy.core.engine] INFO: Closing spider (finished)
2020-03-22 23:42:29 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 923,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 4,
'downloader/response_bytes': 495443,
'downloader/response_count': 4,
'downloader/response_status_count/200': 4,
'elapsed_time_seconds': 2.296482,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2020, 3, 23, 3, 42, 29, 66553),
'log_count/DEBUG': 4,
'log_count/INFO': 10,
'memusage/max': 48963584,
'memusage/startup': 48963584,
'response_received_count': 4,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2020, 3, 23, 3, 42, 26, 770071)}
2020-03-22 23:42:29 [scrapy.core.engine] INFO: Spider closed (finished)
Thanks in advance!
The required pages are dynamically rendered by React, so the data is not in the static HTML that Scrapy downloads.
The required information is inside a script tag, in the root.App.main variable, and can be pulled out with a regular expression and parsed as JSON, as in the example below.
Another option is to use Splash or Selenium rendering.
Working example:
from scrapy import Spider
from scrapy.selector import Selector
from investment.items import InvestmentItem
import json

class InvestmentSpider(Spider):
    name = "investment"
    allowed_domains = ["finance.yahoo.com"]
    start_urls = ["https://finance.yahoo.com/quote/SPY?p=SPY", "https://finance.yahoo.com/quote/DIA?p=DIA", "https://finance.yahoo.com/quote/QQQ?p=QQQ"]

    def parse(self, response):
        pattern = r'\broot\.App\.main\s*=\s*(\{.*?\})\s*;\s*\n'
        json_data = response.css('script::text').re_first(pattern)
        price = json.loads(json_data)['context']['dispatcher']['stores']['QuoteSummaryStore']['price']

        item = InvestmentItem()
        item['closing_px'] = price['regularMarketPrice']['fmt']
        item['closing_pct'] = price['regularMarketChange']['fmt']
        yield item
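For the Splash or Selenium route mentioned above, a minimal Selenium sketch, assuming chromedriver is installed and that the span container XPath from the question still matches (Yahoo may have changed it since), could look like:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
try:
    driver.get("https://finance.yahoo.com/quote/SPY?p=SPY")
    time.sleep(5)  # crude wait for the React app to render; a WebDriverWait would be more robust
    spans = driver.find_elements(By.XPATH, '//div[@class="D(ib) Mend(20px)"]/span')
    closing_px = spans[0].text   # price
    closing_pct = spans[1].text  # change / percentage change
    print(closing_px, closing_pct)
finally:
    driver.quit()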
Hope this will help you:
import scrapy
from scrapy import Spider
from investment.items import InvestmentItem

class InvestmentSpider(Spider):
    name = "investment"

    def start_requests(self):
        urls = ["https://finance.yahoo.com/quote/SPY?p=SPY", "https://finance.yahoo.com/quote/DIA?p=DIA", "https://finance.yahoo.com/quote/QQQ?p=QQQ"]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        data = response.xpath('//div[@class="D(ib) Mend(20px)"]/span/text()').extract()
        item = InvestmentItem()
        item['closing_px'] = data[0]   # 1st span
        item['closing_pct'] = data[1]  # 2nd span
        yield item

How to keep a session authenticated while crawling with scrapy?

I have followed the answer to this question (Crawling with an authenticated session in Scrapy) to use Scrapy with an authenticated session. The problem is that it seems to log in successfully, but when I make a request it seems to be unauthenticated.
Any idea where the problem is?
Here is my Python script:
import scrapy
from scrapy.spiders.init import InitSpider
from scrapy.utils.response import open_in_browser

class LoginSpider(InitSpider):
    name = 'demo'
    login_page = # login page
    inquery = # search query
    start_urls = # urls with queries

    def init_request(self):
        return scrapy.Request(url=self.login_page, callback=self.login)

    def login(self, response):
        open_in_browser(response)
        return [scrapy.FormRequest.from_response(response,
                    formid='login-form',
                    formdata={'username': 'username', 'password': 'password'},
                    callback=self.after_login)]

    def after_login(self, response):
        # check login succeeded before going on
        open_in_browser(response)
        if "invalid username or password" in response.body:
            self.log("Login failed", level=log.ERROR)
            print "FAILED"
            return
        else:
            self.log('authentication succeed')
            return scrapy.Request(url=self.inquery, callback=self.parsse)

    def parsse(self, response):
        for result in response.xpath('//div[@class="span9"]/div[@class="search-result"]/div/a[@class="details"]/@href'):
            print 'new resutl'
            url = response.urljoin(result.extract())
            yield scrapy.Request(url, callback=self.parse_details_contents)

    def parse_details_contents(self, response):
        item = ShodanItem()
        for details in response.xpath('//ul[@class="services"]/li'):
            item['ip'] = response.xpath('/html/body/div[3]/div/div[2]/div/div[1]/div/h2/text()').extract()
            item['services_arr'][0] = details.xpath('/div[1]/div[1]/text()').extract()
            item['services_arr'][1] = details.xpath('/div[1]/div[2]/text()').extract()
            item['services_arr'][2] = details.xpath('/div[1]/div[3]/text()').extract()
            item['services_arr'][3] = details.xpath('/div[2]/h3/text()').extract()
            item['services_arr'][4] = details.xpath('/div[2]/pre/text()').extract()
            print item['services_arr'][4]
            yield item
Here is the log. I assume it does log in, as it redirects to the main page, but afterwards, using the open_in_browser() command, I get a page that asks for authentication in order to use the query:
2016-07-06 15:07:51 [scrapy] INFO: Spider opened
2016-07-06 15:07:51 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-07-06 15:07:51 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-07-06 15:07:52 [scrapy] DEBUG: Crawled (404) <GET https://account.shodan.io/robots.txt> (referer: None)
2016-07-06 15:07:52 [scrapy] DEBUG: Crawled (200) <GET https://account.shodan.io/login> (referer: None)
2016-07-06 15:07:52 [scrapy] DEBUG: Redirecting (302) to <GET https://www.shodan.io/?language=en> from <POST https://account.shodan.io/login>
2016-07-06 15:07:53 [scrapy] DEBUG: Crawled (200) <GET https://www.shodan.io/robots.txt> (referer: None)
2016-07-06 15:07:53 [scrapy] DEBUG: Crawled (200) <GET https://www.shodan.io/?language=en> (referer: https://account.shodan.io/login)
2016-07-06 15:07:53 [shodan.io] DEBUG: authentication succeed
2016-07-06 15:07:54 [scrapy] DEBUG: Crawled (200) <GET https://www.shodan.io/search?query=org%3A%22Instituto+Tecnol%C3%B3gico+y+de+Estudios+Superiores+de%22> (referer: https://www.shodan.io/?language=en)
ASDASDASDASDASDASDASDASDASD
2016-07-06 15:07:54 [scrapy] INFO: Closing spider (finished)
2016-07-06 15:07:54 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2231,
'downloader/request_count': 6,
'downloader/request_method_count/GET': 5,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 11759,
'downloader/response_count': 6,
'downloader/response_status_count/200': 4,
'downloader/response_status_count/302': 1,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 6, 20, 7, 54, 214825),
'log_count/DEBUG': 8,
'log_count/INFO': 7,
'request_depth_max': 2,
'response_received_count': 5,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'start_time': datetime.datetime(2016, 7, 6, 20, 7, 51, 797093)}
I found, by enabling the cookies debugger (COOKIES_DEBUG = True) in the settings file, that the authentication was being done correctly. By adjusting the settings, I managed to make the script work and crawl the site while authenticated.
For anyone with the same problem: I added ROBOTSTXT_OBEY = False and changed the user agent to that of a web browser: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36
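Put together, the relevant settings.py entries would look something like this (values taken from the answer above):
# settings.py
COOKIES_DEBUG = True    # log Cookie / Set-Cookie headers to confirm the session cookie is kept
ROBOTSTXT_OBEY = False  # don't let robots.txt block the authenticated pages
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')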

Scraping data from a table with Scrapy, but no items scraped

I'm trying out Scrapy for the first time. After doing a fair bit of research I got the basics. Now I'm trying to get the data from a table, but it isn't working; no data is scraped. Check below for the source code.
settings.py
BOT_NAME = 'car'
SPIDER_MODULES = ['car.spiders']
NEWSPIDER_MODULE = 'car.spiders'
DEFAULT_ITEM_CLASS = 'car.items.Car58Item'
ITEM_PIPELINES = {'car.pipelines.JsonLinesItemExporter': 300}
items.py
from scrapy.item import Item, Field

class Car58Item(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = Field()
    tip = Field()
    name = Field()
    size = Field()
    region = Field()
    amt = Field()
car_spider.py
# -*- coding=utf-8 -*-
from __future__ import absolute_import
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule, Spider
from car.items import Car58Item

class CarSpider(CrawlSpider):
    name = 'car'
    allowed_domains = ['58.com']
    start_urls = ['http://quanguo.58.com/ershouche']
    rules = [Rule(LinkExtractor(allow=('/pn\d+')), 'parse_item')]  # page-reading strategy

def parse_item(self, response):
    trs = response.xpath("//div[@id='infolist']/table[@class='tbimg']/tr")[1:-2]
    items = []
    # for tr in sel.xpath("id('infolist')/table/tr"):
    for tr in trs:
        item = Car58Item()
        item['url'] = tr.xpath("td[@class='img']/a/@href").extract()
        item['tip'] = tr.xpath("td[@class='t']/a/font/text()").extract()
        item['name'] = tr.xpath("td[@class='t']/a[1]/text()").extract()
        item['size'] = tr.xpath("td[@class='t']/p").extract()
        item['region'] = tr.xpath("td[@class='tc']/a/text()").extract()
        item['amt'] = tr.xpath("td[@class='tc']/b/text()").extract()
        items.append(item)
    return items
pipelines.py
# -*- coding: utf-8 -*-
import json
import codecs

class JsonLinesItemExporter(object):
    def __init__(self):
        self.file = codecs.open('car.json', 'w', encoding='utf-8')

    def process_item(self, items, spider):
        line = json.dumps(dict(items), ensure_ascii=False) + "\n"
        self.file.write(line)
        return items

    def spider_closed(self, spider):
        self.file.close()
I run Scrapy from the shell:
[mayuping@i ~/PycharmProjects/car] $ scrapy crawl car
2016-05-18 10:35:36 [scrapy] INFO: Scrapy 1.0.6 started (bot: car)
2016-05-18 10:35:36 [scrapy] INFO: Optional features available: ssl, http11
2016-05-18 10:35:36 [scrapy] INFO: Overridden settings: {'DEFAULT_ITEM_CLASS': 'car.items.Car58Item', 'NEWSPIDER_MODULE': 'car.spiders', 'SPIDER_MODULES': ['car.spiders'], 'BOT_NAME': 'car'}
2016-05-18 10:35:36 [scrapy] INFO: Enabled extensions: CloseSpider, TelnetConsole, LogStats, CoreStats, SpiderState
2016-05-18 10:35:36 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2016-05-18 10:35:36 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2016-05-18 10:35:36 [scrapy] INFO: Enabled item pipelines: JsonLinesItemExporter
2016-05-18 10:35:36 [scrapy] INFO: Spider opened
2016-05-18 10:35:36 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-05-18 10:35:36 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-05-18 10:35:37 [scrapy] DEBUG: Redirecting (301) to <GET http://quanguo.58.com/ershouche/> from <GET http://quanguo.58.com/ershouche>
2016-05-18 10:35:39 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/> (referer: None)
2016-05-18 10:35:40 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn2/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:42 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn7/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:42 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn6/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:42 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn12/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:43 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn11/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:44 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn9/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:45 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn8/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:45 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn5/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:46 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn10/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:46 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn4/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:46 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn3/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:47 [scrapy] INFO: Closing spider (finished)
2016-05-18 10:35:47 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 5550,
'downloader/request_count': 13,
'downloader/request_method_count/GET': 13,
'downloader/response_bytes': 339809,
'downloader/response_count': 13,
'downloader/response_status_count/200': 12,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 5, 18, 2, 35, 47, 45187),
'log_count/DEBUG': 14,
'log_count/INFO': 7,
'request_depth_max': 1,
'response_received_count': 12,
'scheduler/dequeued': 13,
'scheduler/dequeued/memory': 13,
'scheduler/enqueued': 13,
'scheduler/enqueued/memory': 13,
'start_time': datetime.datetime(2016, 5, 18, 2, 35, 36, 733155)}
2016-05-18 10:35:47 [scrapy] INFO: Spider closed (finished)
But it doesn't scrape any data...
[mayuping@i ~/PycharmProjects/car] $ more car.json
Zero items are output in car.json.
Thanks.
My problem is solved: parse_item was not indented under the class in car_spider.py. After indenting it:
# -*- coding=utf-8 -*-
from __future__ import absolute_import
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule, Spider
from car.items import Car58Item

class CarSpider(CrawlSpider):
    name = 'car'
    allowed_domains = ['58.com']
    start_urls = ['http://quanguo.58.com/ershouche']
    rules = [Rule(LinkExtractor(allow=('/pn\d+')), 'parse_item')]  # page-reading strategy

    def parse_item(self, response):
        trs = response.xpath("//div[@id='infolist']/table[@class='tbimg']/tr")[1:-2]
        items = []
        # for tr in sel.xpath("id('infolist')/table/tr"):
        for tr in trs:
            item = Car58Item()
            item['url'] = tr.xpath("td[@class='img']/a/@href").extract()
            item['tip'] = tr.xpath("td[@class='t']/a/font/text()").extract()
            item['name'] = tr.xpath("td[@class='t']/a[1]/text()").extract()
            # item['description'] = trs.xpath("/td[@class='t']/a/text()").extract()
            item['size'] = tr.xpath("td[@class='t']/p").extract()
            item['region'] = tr.xpath("td[@class='tc']/a/text()").extract()
            item['amt'] = tr.xpath("td[@class='tc']/b/text()").extract()
            items.append(item)
        return items
Scrapy's built-in JSON export also works fine.
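For reference, a minimal sketch of using that built-in feed export instead of the custom pipeline (Scrapy 1.0-era feed settings; the command-line form is equivalent):
# settings.py -- let Scrapy's feed exporter write the JSON lines file
FEED_FORMAT = 'jsonlines'   # one JSON object per line, same shape as the custom pipeline
FEED_URI = 'car.json'
# or, as a one-off from the command line:
#   scrapy crawl car -o car.json -t jsonlines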
