How to keep a session authenticated while crawling with scrapy? - python

I have followed the answer to this question (Crawling with an authenticated session in Scrapy) to use Scrapy with an authenticated session. The problem is that the login appears to succeed, but when I make a subsequent request it seems to be unauthenticated.
Any idea where the problem is?
Here is my Python script:
import scrapy
import logging
from scrapy.spiders.init import InitSpider
from scrapy.utils.response import open_in_browser

class LoginSpider(InitSpider):
    name = 'demo'
    login_page = ''    # login page (URL omitted)
    inquery = ''       # search query (URL omitted)
    start_urls = []    # urls with queries (omitted)

    def init_request(self):
        return scrapy.Request(url=self.login_page, callback=self.login)

    def login(self, response):
        open_in_browser(response)
        return [scrapy.FormRequest.from_response(response,
                                                 formid='login-form',
                                                 formdata={'username': 'username', 'password': 'password'},
                                                 callback=self.after_login)]

    def after_login(self, response):
        # check that the login succeeded before going on
        open_in_browser(response)
        if "invalid username or password" in response.body:
            self.log("Login failed", level=logging.ERROR)
            print "FAILED"
            return
        else:
            self.log('authentication succeed')
            return scrapy.Request(url=self.inquery, callback=self.parsse)

    def parsse(self, response):
        for result in response.xpath('//div[@class="span9"]/div[@class="search-result"]/div/a[@class="details"]/@href'):
            print 'new result'
            url = response.urljoin(result.extract())
            yield scrapy.Request(url, callback=self.parse_details_contents)

    def parse_details_contents(self, response):
        item = ShodanItem()  # ShodanItem comes from the project's items module (not shown in the question)
        for details in response.xpath('//ul[@class="services"]/li'):
            item['ip'] = response.xpath('/html/body/div[3]/div/div[2]/div/div[1]/div/h2/text()').extract()
            item['services_arr'][0] = details.xpath('div[1]/div[1]/text()').extract()
            item['services_arr'][1] = details.xpath('div[1]/div[2]/text()').extract()
            item['services_arr'][2] = details.xpath('div[1]/div[3]/text()').extract()
            item['services_arr'][3] = details.xpath('div[2]/h3/text()').extract()
            item['services_arr'][4] = details.xpath('div[2]/pre/text()').extract()
            print item['services_arr'][4]
        yield item
Here is the log. I assume it does log in, since it redirects to the main page, but afterwards, using the open_in_browser() call, I get a page that asks for authentication in order to use the query:
2016-07-06 15:07:51 [scrapy] INFO: Spider opened
2016-07-06 15:07:51 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-07-06 15:07:51 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-07-06 15:07:52 [scrapy] DEBUG: Crawled (404) <GET https://account.shodan.io/robots.txt> (referer: None)
2016-07-06 15:07:52 [scrapy] DEBUG: Crawled (200) <GET https://account.shodan.io/login> (referer: None)
2016-07-06 15:07:52 [scrapy] DEBUG: Redirecting (302) to <GET https://www.shodan.io/?language=en> from <POST https://account.shodan.io/login>
2016-07-06 15:07:53 [scrapy] DEBUG: Crawled (200) <GET https://www.shodan.io/robots.txt> (referer: None)
2016-07-06 15:07:53 [scrapy] DEBUG: Crawled (200) <GET https://www.shodan.io/?language=en> (referer: https://account.shodan.io/login)
2016-07-06 15:07:53 [shodan.io] DEBUG: authentication succeed
2016-07-06 15:07:54 [scrapy] DEBUG: Crawled (200) <GET https://www.shodan.io/search?query=org%3A%22Instituto+Tecnol%C3%B3gico+y+de+Estudios+Superiores+de%22> (referer: https://www.shodan.io/?language=en)
2016-07-06 15:07:54 [scrapy] INFO: Closing spider (finished)
2016-07-06 15:07:54 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2231,
'downloader/request_count': 6,
'downloader/request_method_count/GET': 5,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 11759,
'downloader/response_count': 6,
'downloader/response_status_count/200': 4,
'downloader/response_status_count/302': 1,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 6, 20, 7, 54, 214825),
'log_count/DEBUG': 8,
'log_count/INFO': 7,
'request_depth_max': 2,
'response_received_count': 5,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'start_time': datetime.datetime(2016, 7, 6, 20, 7, 51, 797093)}

I found, by enabling the cookies debugger (COOKIES_DEBUG = True) in the settings file, that the authentication was in fact being performed correctly. By adjusting the settings, I managed to make the script crawl the site while authenticated.
For anyone with the same problem: I added ROBOTSTXT_OBEY = False and changed the user agent to that of a web browser: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36
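A minimal settings.py sketch of those changes (the setting names are standard Scrapy settings; the user-agent string is the one quoted above):
# settings.py -- sketch of the changes described above
COOKIES_DEBUG = True    # log cookies sent/received, to verify the session is kept
ROBOTSTXT_OBEY = False  # don't let robots.txt rules block the authenticated requests
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36')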

Related

No Pages getting crawled - scrapy

The following Scrapy spider was developed to crawl pages from the Americanas website:
# -*- coding: utf-8 -*-
import scrapy
import urllib
import re
import webscrape.items
import time
from urlparse import urljoin
from HTMLParser import HTMLParser

class AmericanasSpider(scrapy.Spider):
    name = "americanas"
    start_urls = ('http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/',)
    source = webscrape.items.ImportSource("Americanas")

    def parse(self, response):
        ind = 0
        self.source.submit()
        b = []
        for c in response.xpath('//div[@class="item-menu"]/ul'):
            c1 = re.sub('[\t\n]', '', c.xpath('//span [@class="menu-heading"]/text()').extract()[ind])
            if (c1):
                x = webscrape.items.Category(c1)
                x.submit()
                for b in c.xpath('li'):
                    b1 = webscrape.items.Category(b.xpath('a/text()').extract()[0])
                    if (b1):
                        b1.setParent(x.getID())
                        b1.submit()
                        link = b.xpath('@href').extract()
                        urla = urljoin(response.url, link)
                        request = scrapy.Request(urla, callback=self.parse_category)
                        request.meta['idCategory'] = b1.getID()
                        yield request
                        for a in b.xpath('ul/li/a/text()'):
                            a1 = webscrape.items.Category(a.extract())
                            a1.setParent(b1.getID())
                            a1.submit()
                            link = a.xpath('@href').extract()
                            urla = urljoin(response.url, link)
                            request = scrapy.Request(urla, callback=self.parse_category)
                            request.meta['idCategory'] = a1.getID()
                            yield request
            ind = ind + 1

    def parse_category(self, response):
        # products on the page
        items = response.xpath('//div[@class="paginado"]//article[@class="single-product vitrine230 "]')
        for item in items:
            url = item.xpath('.//div[@itemprop="item"]/form/div[@class="productInfo"]/div]/a[@class="prodTitle"]/@href').extract()
            urla = urljoin(response.url, link)
            request = scrapy.Request(urla, callback=self.parse_product)
            request.meta['idCategory'] = response.meta['idCategory']
            yield request
        # next page (if it exists)
        nextpage = response.xpath('//div[@class="pagination"]/ul/li/a[@class="pure-button next"]/@href').extract()
        if (nextpage):
            link = nextpage[0]
            urlb = urljoin(response.url, link)
            self.log('Next Page: {0}'.format(nextpage))
            request = scrapy.Request(urlb, callback=self.parse_category)
            request.meta['idCategory'] = response.meta['idCategory']
            yield request

    def parse_product(self, response):
        print response.url
        title = response.xpath('//title/text()').extract()
        self.log(u'Título: {0}'.format(title))
But I get the following output:
PS C:\Users\Natalia Oliveira\Desktop\Be Happy\behappy\import\webscrape> scrapy crawl americanas
2016-10-06 17:28:04 [scrapy] INFO: Scrapy 1.1.2 started (bot: webscrape)
2016-10-06 17:28:04 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'webscrape.spiders', 'REDIRECT_ENABLED': Fal
se, 'SPIDER_MODULES': ['webscrap.spiders'], 'BOT_NAME': 'webscrape'}
2016-10-06 17:28:04 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2016-10-06 17:28:05 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-10-06 17:28:05 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-10-06 17:28:05 [scrapy] INFO: Enabled item pipelines:
[]
2016-10-06 17:28:05 [scrapy] INFO: Spider opened
2016-10-06 17:28:05 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-10-06 17:28:05 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-10-06 17:28:05 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.m
c_id=home-menuLista-alimentos/> (referer: None)
2016-10-06 17:28:07 [scrapy] DEBUG: Filtered duplicate request: <GET http://www.americanas.com.br/loja/226795/alimentos-
e-bebidas?WT.mc_id=home-menuLista-alimentos/> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all dupli
cates)
2016-10-06 17:28:07 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.m
c_id=home-menuLista-alimentos/> (referer: http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-men
uLista-alimentos/)
2016-10-06 17:28:22 [scrapy] INFO: Closing spider (finished)
2016-10-06 17:28:22 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 931,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 80585,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'dupefilter/filtered': 60,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 10, 6, 20, 28, 22, 257000),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2016, 10, 6, 20, 28, 5, 346000)}
2016-10-06 17:28:22 [scrapy] INFO: Spider closed (finished)
I really don't know what is wrong here because I'm a beginner with Scrapy. Where is the mistake?
The parse method runs as expected, so I think the error must be in the parse_category or parse_product methods.
Your XPath is not correct, and there is only one item-menu per page. I have removed the items logic as I don't know what those items are. This will get you all the links from the item-menu ul; you can add back in whatever logic you like:
def parse(self, response):
    for url in response.xpath('//div[@class="item-menu"]/ul/li[@class="item-linha"]/a/@href').extract():
        if not url.startswith("http"):
            url = response.urljoin(url)
        request = scrapy.Request(url, callback=self.parse_category)
        request.meta['idCategory'] = url  # add whatever here
        yield request
Your next method is also over-complicated: you don't need to worry about anything but the anchor tags with the prodTitle class:
def parse_category(self, response):
    # products on the page
    urls = response.css('a.prodTitle::attr(href)').extract()
    for url in urls:
        request = scrapy.Request(url, callback=self.parse_product)
        request.meta['idCategory'] = response.meta['idCategory']
        yield request
    # you want to check for the anchor with "Próxima" text
    nextpage = response.xpath(u'//ul[@class="pure-paginator acrN"]/li/a[contains(.,"Próxima")]/@href').extract_first()
    if nextpage:
        self.log(u'Next Page: {0}'.format(nextpage))
        request = scrapy.Request(nextpage, callback=self.parse_category)
        request.meta['idCategory'] = response.meta['idCategory']
        yield request

def parse_product(self, response):
    print response.url
    title = response.xpath('//title/text()').extract_first()
    self.log(u'Título: {0}'.format(title))
If you run it now you will see lots of output like:
2016-10-06 23:25:15 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/314061/alimentos-e-bebidas/biscoitos?ofertas.offset=30
2016-10-06 23:25:15 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/342151/alimentos-e-bebidas/azeite-e-vinagre?ofertas.offset=30
2016-10-06 23:25:15 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/342129/alimentos-e-bebidas/barra-de-cereais?ofertas.offset=30
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/15815078/nan-comfor-1-formula-infantil-nestle-lata-800g> (referer: http://www.americanas.com.br/linha/314080/alimentos-e-bebidas/alimentacao-infantil)
http://www.americanas.com.br/produto/15815078/nan-comfor-1-formula-infantil-nestle-lata-800g
2016-10-06 23:25:16 [americanas] DEBUG: Título: Nan Comfor 1 Fórmula Infantil Nestlé Lata 800g - Americanas.com
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/316829/eletrodomesticos/adega-de-vinho> (referer: http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/)
2016-10-06 23:25:16 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/316829/eletrodomesticos/adega-de-vinho?ofertas.offset=30
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/7170286/goiabada-135g-diet-house> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/9955598/adocante-em-sache-fit-caixa-com-30-unidades-de-2-5g-uniao> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/285368/utilidades-domesticas/vinho> (referer: http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/)
http://www.americanas.com.br/produto/7170286/goiabada-135g-diet-house
2016-10-06 23:25:16 [americanas] DEBUG: Título: Goiabada 135g - Diet House - Americanas.com
http://www.americanas.com.br/produto/9955598/adocante-em-sache-fit-caixa-com-30-unidades-de-2-5g-uniao
2016-10-06 23:25:16 [americanas] DEBUG: Título: Adoçante Em Sache Fit Caixa Com 30 Unidades De 2,5g União - Americanas.com
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/121047374/barra-de-chocolate-ao-leite-lacta-150g-1-unidade> (referer: http://www.americanas.com.br/linha/314045/alimentos-e-bebidas/bomboniere)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/314080/alimentos-e-bebidas/alimentacao-infantil?ofertas.offset=30> (referer: http://www.americanas.com.br/linha/314080/alimentos-e-bebidas/alimentacao-infantil)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce?ofertas.offset=30> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/9800047/acucar-refinado-caixa-com-400-envelopes-x-5g-uniao-premium> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
http://www.americanas.com.br/produto/121047374/barra-de-chocolate-ao-leite-lacta-150g-1-unidade
2016-10-06 23:25:16 [americanas] DEBUG: Título: Barra de Chocolate Ao leite Lacta 150g - 1 unidade - Americanas.com

Scraping Data from a table with Scrapy but not Scraped Items

I'm trying out Scrapy for the first time. After doing a fair bit of research I got the basics down. Now I'm trying to get the data out of a table, but it isn't working: no data is scraped. The source code is below.
settings.py
BOT_NAME = 'car'
SPIDER_MODULES = ['car.spiders']
NEWSPIDER_MODULE ='car.spiders'
DEFAULT_ITEM_CLASS = 'car.items.Car58Item'
ITEM_PIPELINES = {'car.pipelines.JsonLinesItemExporter': 300}
items.py
from scrapy.item import Item, Field

class Car58Item(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = Field()
    tip = Field()
    name = Field()
    size = Field()
    region = Field()
    amt = Field()
car_spider.py
# -*- coding=utf-8 -*-
from __future__ import absolute_import
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule, Spider
from car.items import Car58Item

class CarSpider(CrawlSpider):
    name = 'car'
    allowed_domains = ['58.com']
    start_urls = ['http://quanguo.58.com/ershouche']
    rules = [Rule(LinkExtractor(allow=('/pn\d+')), 'parse_item')]  # page-reading (pagination) strategy

def parse_item(self, response):
    trs = response.xpath("//div[@id='infolist']/table[@class='tbimg']/tr")[1:-2]
    items = []
    #for tr in sel.xpath("id('infolist')/table/tr"):
    for tr in trs:
        item = Car58Item()
        item['url'] = tr.xpath("td[@class='img']/a/@href").extract()
        item['tip'] = tr.xpath("td[@class='t']/a/font/text()").extract()
        item['name'] = tr.xpath("td[@class='t']/a[1]/text()").extract()
        item['size'] = tr.xpath("td[@class='t']/p").extract()
        item['region'] = tr.xpath("td[@class='tc']/a/text()").extract()
        item['amt'] = tr.xpath("td[@class='tc']/b/text()").extract()
        items.append(item)
    return items
pipelines.py
# -*- coding: utf-8 -*-
import json
import codecs

class JsonLinesItemExporter(object):
    def __init__(self):
        self.file = codecs.open('car.json', 'w', encoding='utf-8')

    def process_item(self, items, spider):
        line = json.dumps(dict(items), ensure_ascii=False) + "\n"
        self.file.write(line)
        return items

    def spider_closed(self, spider):
        self.file.close()
I run Scrapy from the shell:
[mayuping@i ~/PycharmProjects/car]$ scrapy crawl car
2016-05-18 10:35:36 [scrapy] INFO: Scrapy 1.0.6 started (bot: car)
2016-05-18 10:35:36 [scrapy] INFO: Optional features available: ssl, http11
2016-05-18 10:35:36 [scrapy] INFO: Overridden settings: {'DEFAULT_ITEM_CLASS': 'car.items.Car58Item', 'NEWSPIDER_MODULE': 'car.spiders', 'SPIDER_MODULES': ['car.spiders'], 'BOT_NAME': 'car'}
2016-05-18 10:35:36 [scrapy] INFO: Enabled extensions: CloseSpider, TelnetConsole, LogStats, CoreStats, SpiderState
2016-05-18 10:35:36 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2016-05-18 10:35:36 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2016-05-18 10:35:36 [scrapy] INFO: Enabled item pipelines: JsonLinesItemExporter
2016-05-18 10:35:36 [scrapy] INFO: Spider opened
2016-05-18 10:35:36 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-05-18 10:35:36 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-05-18 10:35:37 [scrapy] DEBUG: Redirecting (301) to <GET http://quanguo.58.com/ershouche/> from <GET http://quanguo.58.com/ershouche>
2016-05-18 10:35:39 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/> (referer: None)
2016-05-18 10:35:40 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn2/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:42 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn7/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:42 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn6/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:42 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn12/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:43 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn11/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:44 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn9/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:45 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn8/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:45 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn5/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:46 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn10/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:46 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn4/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:46 [scrapy] DEBUG: Crawled (200) <GET http://quanguo.58.com/ershouche/pn3/> (referer: http://quanguo.58.com/ershouche/)
2016-05-18 10:35:47 [scrapy] INFO: Closing spider (finished)
2016-05-18 10:35:47 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 5550,
'downloader/request_count': 13,
'downloader/request_method_count/GET': 13,
'downloader/response_bytes': 339809,
'downloader/response_count': 13,
'downloader/response_status_count/200': 12,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 5, 18, 2, 35, 47, 45187),
'log_count/DEBUG': 14,
'log_count/INFO': 7,
'request_depth_max': 1,
'response_received_count': 12,
'scheduler/dequeued': 13,
'scheduler/dequeued/memory': 13,
'scheduler/enqueued': 13,
'scheduler/enqueued/memory': 13,
'start_time': datetime.datetime(2016, 5, 18, 2, 35, 36, 733155)}
2016-05-18 10:35:47 [scrapy] INFO: Spider closed (finished)
But it doesn't scrape any data:
[mayuping@i ~/PycharmProjects/car]$ more car.json
Zero items are output to car.json.
Thanks.
The problem is solved: parse_item was not indented (it sat outside the class) in car_spider.py. After indenting it:
# -*- coding=utf-8 -*-
from __future__ import absolute_import
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule, Spider
from car.items import Car58Item

class CarSpider(CrawlSpider):
    name = 'car'
    allowed_domains = ['58.com']
    start_urls = ['http://quanguo.58.com/ershouche']
    rules = [Rule(LinkExtractor(allow=('/pn\d+')), 'parse_item')]  # page-reading (pagination) strategy

    def parse_item(self, response):
        trs = response.xpath("//div[@id='infolist']/table[@class='tbimg']/tr")[1:-2]
        items = []
        #for tr in sel.xpath("id('infolist')/table/tr"):
        for tr in trs:
            item = Car58Item()
            item['url'] = tr.xpath("td[@class='img']/a/@href").extract()
            item['tip'] = tr.xpath("td[@class='t']/a/font/text()").extract()
            item['name'] = tr.xpath("td[@class='t']/a[1]/text()").extract()
            # item['description'] = trs.xpath("/td[@class='t']/a/text()").extract()
            item['size'] = tr.xpath("td[@class='t']/p").extract()
            item['region'] = tr.xpath("td[@class='tc']/a/text()").extract()
            item['amt'] = tr.xpath("td[@class='tc']/b/text()").extract()
            items.append(item)
        return items
Scrapy's built-in JSON export works fine for this.
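For example, a sketch of the standard feed export, which makes the custom pipeline unnecessary (either the command-line option or the settings below work):
# Option 1: pass the output file on the command line
#   scrapy crawl car -o car.json
# Option 2: configure the feed export in settings.py
FEED_URI = 'car.json'
FEED_FORMAT = 'jsonlines'   # one JSON object per line, like the custom exporter above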

Scrapy initialized function

I am following this topic to extract content from a website that requires authentication. I have two versions of the code; the first one looks like this:
class FoodCrawler(InitSpider):
    def parse(self, response):
        pass

    name = "theCrawler"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    login_page = 'http://example.com/login'

    def __init__(self, user, password, *args, **kwargs):
        super(FoodCrawler, self).__init__(*args, **kwargs)
        self.password = password
        self.user = user
        msg = 'The account will be used ' + user + ' ' + password
        self.log(msg, level=logging.INFO)

    def init_request(self):
        """This function is called before crawling starts."""
        msg = {'email': self.user, 'password': self.password,
               'reCaptchaResponse': '', 'rememberMe': 'true'}
        headers = {'X-Requested-With': 'XMLHttpRequest',
                   'Content-Type': 'application/json'}
        yield Request(self.login_page, method='POST', body=json.dumps(msg), headers=headers,
                      callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        if json.loads(response.body)['isSuccess']:
            self.log("Successfully logged in!")
            self.initialized(response)
        else:
            self.log("Bad times :(")

    def initialized(self, response=None):
        self.log("initialized")
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
In the second version I only change the initialized function; the rest is the same:
def initialized(self, response=None):
    self.log("initialized")
The difference is that the first version's initialized may contain more code, while the second one's doesn't (see (*) below for details). To demonstrate, look at self.log("initialized"): the first version does not work properly. When I run it, the first version never shows the DEBUG: initialized message from self.log("initialized"), while the second version does.
The full log produced by the first version is:
2016-01-05 16:05:38 [scrapy] INFO: Scrapy 1.0.3 started (bot: MySpider)
2016-01-05 16:05:38 [scrapy] INFO: Optional features available: ssl, http11, boto
2016-01-05 16:05:38 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'MySpider.spiders', 'SPIDER_MODULES': ['MySpider.spiders'], 'CONCURRENT_REQUESTS': 4, 'BOT_NAME': 'MySpider'}
2016-01-05 16:05:39 [scrapy] INFO: Enabled extensions: CloseSpider, TelnetConsole, LogStats, CoreStats, SpiderState
2016-01-05 16:05:39 [theCrawler] INFO: The account will be used username#gmail.com 123456789
2016-01-05 16:05:39 [py.warnings] WARNING: /usr/lib/python2.7/site-packages/scrapy/utils/deprecate.py:155: ScrapyDeprecationWarning: `scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware` class is deprecated, use `scrapy.downloadermiddlewares.useragent.UserAgentMiddleware` instead
ScrapyDeprecationWarning)
2016-01-05 16:05:39 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, RotateUserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2016-01-05 16:05:39 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2016-01-05 16:05:39 [scrapy] INFO: Enabled item pipelines:
2016-01-05 16:05:39 [scrapy] INFO: Spider opened
2016-01-05 16:05:39 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-01-05 16:05:39 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-01-05 16:05:39 [scrapy] DEBUG: Crawled (200) <POST http://www.example.com/login> (referer: None)
2016-01-05 16:05:39 [theCrawler] DEBUG: Successfully logged in!
2016-01-05 16:05:39 [scrapy] INFO: Closing spider (finished)
2016-01-05 16:05:39 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 494,
'downloader/request_count': 1,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 1187,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 1, 5, 9, 5, 39, 363402),
'log_count/DEBUG': 3,
'log_count/INFO': 8,
'log_count/WARNING': 1,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2016, 1, 5, 9, 5, 39, 168955)}
2016-01-05 16:05:39 [scrapy] INFO: Spider closed (finished)
I would like to know why. Could you please give any advice? Thank you in advance.
[Updated]
import json, pdb, logging
from scrapy import Request
from scrapy.spiders.init import InitSpider
(*) The initialized function calls additional methods such as self.my_requests(), but that doesn't work either: execution never enters self.my_requests().
def initialized(self, response=None):
    self.log("initialized")
    self.my_requests()

def my_requests(self):
    self.log("my_requests")
    pdb.set_trace()
    for url in self.start_urls:
        yield self.make_requests_from_url(url)
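A minimal standalone sketch of the Python behaviour that seems to be at play here: a function containing yield is a generator function, so a bare call such as self.initialized(response) or self.my_requests() only creates a generator object, and its body (including the self.log calls) never runs unless something iterates the returned generator:
def log_and_yield():
    print "body runs"         # printed only once the generator is iterated
    yield "a request"

gen = log_and_yield()         # nothing is printed here: the body has not started
for value in gen:             # iterating the generator finally runs the body
    print value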

Scrapy scrapes data but no output to file

I've been getting blank JSON files despite being able to execute most of the lines successfully in the Scrapy shell.
When I run the command scrapy crawl courses with my courses bot being:
from scrapy.spiders import CrawlSpider
from scrapy.linkextractors import LinkExtractor
from tutorial.items import CoursesItem
from bs4 import BeautifulSoup
import scrapy

class CoursesSpider(CrawlSpider):
    name = 'courses'
    allowed_domains = ['guide.berkeley.edu']
    start_urls = ['http://guide.berkeley.edu/courses/ast',
                  ]

    def parse(self, response):
        soup = BeautifulSoup(response.body_as_unicode(), 'lxml')
        items = []
        for course_info, course_desc, course_req in zip(soup.find_all('p', class_='courseblocktitle'), \
                                                        soup.find_all('p', class_='courseblockdesc'), \
                                                        soup.find_all('div', class_='course-section')):
            item = CoursesItem()
            item['title'] = course_info.text
            item['description'] = course_desc.text
            item['requirements'] = course_req.text
            yield items
and my settings.py being:
BOT_NAME = 'courses'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0.3'
# ITEM_PIPELINES = {
# 'tutorial.pipelines.JsonExportPipeline': 300
# }
FEED_URI = 'output.json'
FEED_FORMAT = 'json'
As you can see in the commented section, I've also tried making a pipeline.
My pipeline file looks like this:
from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter

class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_spider.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
But I feel that might not be where the error lies, although it's possible, since I largely followed a couple of tutorials I found.
I used BeautifulSoup to simplify the way I select the items.
Last but not least, the terminal looks like this after I run it.
2015-08-07 23:58:44 [scrapy] INFO: Scrapy 1.0.1 started (bot: courses)
2015-08-07 23:58:44 [scrapy] INFO: Optional features available: ssl, http11
2015-08-07 23:58:44 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'tu
torial.spiders', 'FEED_URI': 'output.json', 'SPIDER_MODULES': ['tutorial.spiders
'], 'BOT_NAME': 'courses', 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv
:39.0) Gecko/20100101 Firefox/39.0.3', 'FEED_FORMAT': 'json'}
2015-08-07 23:58:44 [scrapy] INFO: Enabled extensions: CloseSpider, FeedExporter
, TelnetConsole, LogStats, CoreStats, SpiderState
2015-08-07 23:58:44 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddl
eware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultH
eadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMidd
leware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-08-07 23:58:44 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-08-07 23:58:44 [scrapy] INFO: Enabled item pipelines:
2015-08-07 23:58:44 [scrapy] INFO: Spider opened
2015-08-07 23:58:44 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 i
tems (at 0 items/min)
2015-08-07 23:58:44 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024
2015-08-07 23:58:45 [scrapy] DEBUG: Redirecting (301) to <GET http://guide.berke
ley.edu/courses/ast/> from <GET http://guide.berkeley.edu/courses/ast>
2015-08-07 23:58:45 [scrapy] DEBUG: Crawled (200) <GET http://guide.berkeley.edu
/courses/ast/> (referer: None)
2015-08-07 23:58:45 [scrapy] INFO: Closing spider (finished)
2015-08-07 23:58:45 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 537,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 22109,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 8, 8, 6, 58, 45, 600000),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2015, 8, 8, 6, 58, 44, 663000)}
2015-08-07 23:58:45 [scrapy] INFO: Spider closed (finished)
I've run through most of my options thoroughly. Running the --parse option tells me that my item parsing is off, but even then I'd like to know where to go beyond the parse bug fix (i.e. outputting to JSON). Ultimately, I want to pipe all this data into a database.
I know it's a lot to look through, but any help is appreciated, thanks!
You wrote the wrong variable name. In the parse function, change items -> item, so that you yield the populated item rather than the unused items list.
def parse(self, response):
    soup = BeautifulSoup(response.body_as_unicode(), 'lxml')
    items = []
    for ...
        item = CoursesItem()
        item['title'] = course_info.text
        item['description'] = course_desc.text
        item['requirements'] = course_req.text
        yield items  # -> item

Issue With Scrapy Cookie Implementation

I have what I think is a correct implementation of a spider that overrides two built-in functions:
parse_start_url() and parse()
When I run the spider with the custom overridden parse() function commented out, the spider runs fine and aggregates links using SgmlLinkExtractor, all OK.
But when I uncomment the custom parse() function, the spider runs without error yet produces no output, so it must be the handling of requests and responses between the functions.
I have spent a few too many hours trying to get this to work, using different approaches to overriding functions, InitSpider/BaseSpider structures, etc.; nothing ever seems to set the cookies correctly.
I am on version 0.16.4 which is old, so perhaps there's an issue there?
* SOLVED *
Nevermind, I just solved it with a deep breath and a little bit of luck.
I revisited the 'no middleware' approach, using CrawlSpider, SgmlLinkExtractor() and an overridden make_requests_from_url().
So I removed the block of code that was supposed to override parse(), and added this:
def make_requests_from_url(self, url):
    request = Request(url, cookies={'somedomain.com.au+2': 'national'}, dont_filter=True)
    return request
SPIDER:
from scrapy.contrib.exporter import JsonItemExporter
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.shell import inspect_response
from scrapy.http.cookies import CookieJar
from TM.items import TMItem
import json
import time
import datetime
import re
import sys
import os

COOKIES_DEBUG = True
COOKIES_ENABLED = True
SPIDER_NAME = "TKComAuSpider"
SPIDER_VERSION = "1.0"

class TKComAuSpider(CrawlSpider):
    name = "TKComAuMusicSpecific"
    allowed_domains = ["domain.com.au"]
    global response_urls
    response_urls = []
    global site_section_category
    global master_items
    master_items = []
    start_urls = ["http://some.domain.com.au/shows/genre.aspx?c=2048"]
    rules = (Rule(SgmlLinkExtractor(allow=(".*page=[0-9]+.*", ),
                  restrict_xpaths=('//*[@id="ctl00_uiBodyMain_searchResultsControl_uiPaginateBottom_List"]/ul/li',)),
             callback="parse_it", follow=True),
             )

    def parse(self, response):
        request_with_cookies = Request(url=self.start_urls[0], cookies={'domain.com.au+2': 'national'})
        print '\n\n' + request_with_cookies.url + '\n\n'
        yield request_with_cookies

    def parse_start_url(self, response):
        list(self.parse_it(response))

    def parse_it(self, response):
        spider_name = "TKComAuMusicSpecific"
        doc_date = datetime.datetime.now().strftime("%d-%m-%y-%H:%M")
        items = []
        hxs = HtmlXPathSelector(response)
        # RESPONSE ASSIGNMENT #
        response_url = response.url
        response_urls.append(response_url)
        # cl = response.headers.getlist('Cookie')
        # if cl:
        #     msg = "Sending cookies to: %s" % response_url + os.linesep
        #     msg += os.linesep.join("Cookie: %s" % c for c in cl)
        #     log.msg(msg, spider=spider, level=log.DEBUG)
        # CUSTOM SITE_SECTION TO CREATE SPIDER CAT FROM RESPONSE_URL #
        site_section_category = re.sub(r'^.*//[a-zA-Z0-9._-]+([^.?]+).*$', r'\1', response.url).title().replace('/', '')
        spider_category = "TKTerms" + site_section_category
        file_name = 'out/' + spider_category + ".out"
        with open("log/response.log", 'a') as l:
            l.write(doc_date + ' ' + ' spider: ' + spider_name + '\nresponse_url: ' + response_url
                    + '\nsite_section_category: ' + site_section_category
                    + '\nspider_category: ' + spider_category + '\n')
        f = open(file_name, 'w')
        for site in hxs.select('//*[@class="contentEvent"]'):
            link = site.select('h6/a/@href').extract()
            title = site.select('h6/a/text()').extract()
            f.write("%s\n" % title)
            master_items.append({"title": title[0], "item_type": spider_category})
            yield TMItem(title=title[0], item_type=spider_category)
        f.close()
        json_out = 'json/' + spider_name + '.json'
        f = open(json_out, 'w')
        final_json = (json.dumps({"docs": [{"spider_name": SPIDER_NAME, "spider_version": SPIDER_VERSION},
                                           {"doc_title": spider_name, "doc_date": doc_date,
                                            "urls": response_urls}, master_items]}))
        f.write(final_json)
        f.close()
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, RetryMiddleware, DefaultHeadersMiddleware, RedirectMiddleware, CookiesMiddleware, HttpCompressionMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Enabled item pipelines: JsonWriterPipelineLines, JsonWriterPipeline
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] INFO: Spider opened
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6024
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Web service listening on 0.0.0.0:6081
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] DEBUG: Redirecting (302) to <GET http://www.some.com.au/detection.aspx?rt=http%3a%2f%2fsome.domain.com.au%2fshows%2fgenre.aspx%3fc%3d2048> from <GET http://some.domain.com.au/shows/genre.aspx?c=2048>
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] DEBUG: Redirecting (302) to <GET http://some.domain.com.au/shows/genre.aspx?c=2048> from <GET http://www.some.com.au/detection.aspx?rt=http%3a%2f%2fsome.domain.com.au%2fshows%2fgenre.aspx%3fc%3d2048>
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] DEBUG: Crawled (200) <GET http://some.domain.com.au/shows/genre.aspx?c=2048> (referer: None)
http://some.domain.com.au/shows/genre.aspx?c=2048
2014-04-30 13:15:47+1000 [TKComAuMusicSpecific] DEBUG: Crawled (200) <GET http://some.domain.com.au/shows/genre.aspx?c=2048> (referer: http://some.domain.com.au/shows/genre.aspx?c=2048)
http://some.domain.com.au/shows/genre.aspx?c=2048
2014-04-30 13:15:47+1000 [TKComAuMusicSpecific] INFO: Closing spider (finished)
2014-04-30 13:15:47+1000 [TKComAuMusicSpecific] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1260,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 4,
'downloader/response_bytes': 146364,
'downloader/response_count': 4,
'downloader/response_status_count/200': 2,
'downloader/response_status_count/302': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2014, 4, 30, 3, 15, 47, 108720),
'log_count/DEBUG': 10,
'log_count/INFO': 4,
'request_depth_max': 2,
'response_received_count': 2,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'start_time': datetime.datetime(2014, 4, 30, 3, 15, 46, 220003)}
2014-04-30 13:15:47+1000 [TKComAuMusicSpecific] INFO: Spider closed (finished)
