Scrapy: Debug Redirecting (301) - python

Previously I was getting the error "HTTP status code is not handled or not allowed". After changing USER_AGENT from its default value, I now get a different message (shown after the code):
import scrapy
class OlxSpider(scrapy.Spider):
    """Crawl the listing pages under pe.olx.com.br/imoveis/aluguel.

    ``parse`` walks a listing page and schedules one request per ad;
    ``parse_detail`` turns an ad page into a plain ``dict`` item.
    """

    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        """Yield a detail request per ad link, then a request for the next page."""
        items = response.xpath(
            '//div[contains(@class,"section_OLXad-list")]//li[contains'
            '(@class,"item")]'
        )
        for item in items:
            url = item.xpath(
                ".//a[contains(@class,'OLXad-list-link')]/@href"
            ).extract_first()
            # extract_first() returns None when nothing matches, and
            # scrapy.Request rejects a None URL.
            if url:
                yield scrapy.Request(url=url, callback=self.parse_detail)

        next_page = response.xpath(
            '//li[contains(@class,"item next")]//a/@href'
        ).extract_first()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        """Build and yield a dict item from a single ad page."""
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath(
            '//div[contains(@class,"photos")]//a/@href'
        ).extract()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath(
            'normalize-space(//h1[contains(@id,"ad_title")]//.)'
        ).extract_first()
        item['price'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-price")]'
            '//span[contains(@class,"actual-price")]//.)'
        ).extract_first()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        # .re() returns a list of captures; fall back to '' when there are none.
        item['date'] = (date[0] if date else '') or ''
        yield item
When I run the .py file in the terminal, I get the following output:
2022-01-13 12:36:36 [scrapy.core.engine] INFO: Spider opened
2022-01-13 12:36:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-01-13 12:36:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/robots.txt> from <GET http://pe.olx.com.br/robots.txt>
2022-01-13 12:36:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://pe.olx.com.br/robots.txt> (referer: None)
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/imoveis/aluguel> from <GET http://pe.olx.com.br/imoveis/aluguel>
Does anyone know what might be causing this problem?
P.s.: I have tried these solutions Python Scrapy 301 redirects

It's just redirected from http to https so there's no problem there.
Your xpath is completely wrong. I fixed it in parse, and I fixed 3 xpaths in parse_detail as an example, but you need to fix the rest of them.
import scrapy
class OlxSpider(scrapy.Spider):
    """Spider for pe.olx.com.br rental listings with updated selectors.

    The listing selectors in ``parse`` and the ``photos``, ``title`` and
    ``price`` selectors in ``parse_detail`` were rewritten; the remaining
    ``OLXad-*`` selectors are carried over unchanged and still need fixing.
    """

    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        """Yield a detail request per ad link, then a request for the next page."""
        # Debugging aid: drop into a shell on this response with
        #   from scrapy.shell import inspect_response
        #   inspect_response(response, self)
        items = response.xpath('//ul[@id="ad-list"]/li')
        for item in items:
            url = item.xpath('.//a/@href').get()
            # .get() returns None for list entries without a link.
            if url:
                yield scrapy.Request(url=url, callback=self.parse_detail)

        next_page = response.xpath(
            '//a[@data-lurker-detail="next_page"]/@href'
        ).get()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        """Build and yield a dict item from a single ad page."""
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        # .get() keeps only the first matching image source.
        item['photos'] = response.xpath('//img[@class="image "]/@src').get()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath('//h1/text()').get()
        item['price'] = response.xpath('//h2/text()').get()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        # .re() returns a list of captures; fall back to '' when there are none.
        item['date'] = (date[0] if date else '') or ''
        yield item

Related

Scrapy doesn't work for turning all the pages

I want to crawl the whole product category, but it works only up to a certain point and then it stops.
Here is my code:
import scrapy
from Demo.items import DemoItem
class ProductSpider(scrapy.Spider):
    """Scrape manufacturer and part number from an Octopart category search."""

    name = 'black1'
    start_urls = ['https://octopart.com/search?category_ids=4215&start=0']

    def parse(self, response):
        """Yield one ``DemoItem`` per product card, then follow the 'Next' link."""
        cards = response.xpath(
            "//div[@class='serp-card-header media']/div[@class='media-body']"
        )
        for product in cards:
            name = product.xpath(
                ".//a/span[@class='part-card-manufacturer']/text()"
            ).extract()
            ver = product.xpath(
                ".//a/span[@class='part-card-mpn']/text()"
            ).extract()
            # A fresh item per product: re-yielding one shared instance lets
            # later iterations overwrite items that were already yielded.
            item = DemoItem()
            item['product_name'] = ''.join(name).strip()
            item['product_code'] = ''.join(ver).strip()
            yield item

        next_page = response.xpath(
            "//a[contains(text(), 'Next')]/@href"
        ).extract_first()
        print(next_page)
        if next_page is not None:
            next_page_link = response.urljoin(next_page)
            print(next_page_link)
            yield scrapy.Request(url=next_page_link, callback=self.parse)
And the outcome:
https://octopart.com/search?category_ids=4215&start=200
2019-03-06 13:51:46 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://octopart.com/search?category_ids=4215&start=200> (referer: https://octopart.com/search?category_ids=4215&start=190)
2019-03-06 13:51:46 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://octopart.com/search?category_ids=4215&start=200>: HTTP status code is not handled or not allowed

item loader skip values scrapy

I'm using an item loader with Scrapy across multiple pages. The item loader returns empty dictionaries for some pages, even though the same rules return the expected values when I parse only those pages. Does anyone know why?
spider code:
class AllDataSpider(scrapy.Spider):
    """Walk an Amazon category listing and load one item per product page."""

    name = 'all_data'  # spider name
    allowed_domains = ['amazon.com']
    # Start URL of the category listing (stray spaces inside the query
    # string removed so the parameter names are sent intact).
    start_urls = [
        "https://www.amazon.com/s?bbn=2619533011"
        "&rh=n%3A2619533011%2Cp_n_availability%3A2661601011"
        "&ie=UTF8&qid=1541604856&ref=lp_2619533011_nr_p_n_availability_1"
    ]
    custom_settings = {'FEED_URI': 'pets_.csv'}  # CSV output file name

    def parse(self, response):
        """Parse a category page.

        Stores the category label on the spider, yields one request per
        ``data-asin`` value found, and follows the "Next" link if present.
        """
        self.category = response.xpath(
            '//span[contains(@class, "nav-a-content")]//text()'
        ).extract_first()

        urls = response.xpath('//*[@data-asin]//@data-asin').extract()
        for url in urls:
            base = f"https://www.amazon.com/dp/{url}"
            yield scrapy.Request(base, callback=self.parse_item)

        next_page = response.xpath(
            '//*[text()="Next"]//@href'
        ).extract_first()
        if next_page is not None:
            # No callback given, so Scrapy routes the response back to parse().
            yield scrapy.Request(response.urljoin(next_page),
                                 dont_filter=True)

    def parse_item(self, response):
        """Populate an ``AmazonDataLoader`` from a product page and return the item."""
        loader = AmazonDataLoader(selector=response)
        loader.add_xpath(
            "Availability",
            '//div[contains(@id, "availability")]//span//text()',
        )
        loader.add_xpath("NAME", '//h1[@id="title"]//text()')
        loader.add_xpath("ASIN", '//*[@data-asin]//@data-asin')
        loader.add_xpath(
            "REVIEWS",
            '//span[contains(@id, "Review")]//text()',
        )

        # Prefer the dedicated SalesRank node; otherwise fall back to the
        # first nested span whose text contains a literal "#".
        rank_check = response.xpath('//*[@id="SalesRank"]//text()')
        if len(rank_check) > 0:
            loader.add_xpath("RANKING", '//*[@id="SalesRank"]//text()')
        else:
            loader.add_xpath(
                "RANKING",
                '//span//span[contains(text(), "#")][1]//text()',
            )

        loader.add_value("CATEGORY", self.category)
        return loader.load_item()
For some pages it returns all values, for some it returns just the category, and for others (which follow the same rules when I parse them on their own) it returns nothing. It also closes the spider before finishing, without any errors.
DEBUG: Scraped from <200 https://www.amazon.com/dp/B0009X29WK>
{'ASIN': 'B0009X29WK',
'Availability': 'In Stock.',
'NAME': " Dr. Elsey's Cat Ultra Premium Clumping Cat Litter, 40 pound bag ( "
'Pack May Vary ) ',
'RANKING': '#1',
'REVIEWS': '13,612'}
2019-01-21 21:13:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/dp/B01N9KSITZ> (referer: https://www.amazon.com/s?i=pets&bbn=2619533011&rh=n%3A2619533011%2Cp_n_availability%3A2661601011&lo=grid&page=2&ie=UTF8&qid=1548097190&ref=sr_pg_1)
2019-01-21 21:13:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/dp/B01N9KSITZ>
{}

Scrapy stops scraping but continues to crawl

I’m trying to scrape different information from several pages of a website.
Until the sixteenth page, everything works: the pages are crawled and scraped, and the information is stored in my database. After the sixteenth page, however, it stops scraping but continues to crawl.
I checked the website and there are more than 470 pages with information. The HTML tags are the same, so I don't understand why it stopped scraping.
Python:
def url_lister(page_limit=480):
    """Return the paginated listing URLs for pages ``1`` to ``page_limit - 1``.

    Args:
        page_limit: Exclusive upper bound on the page number. The default of
            480 reproduces the original hard-coded range (pages 1 to 479).

    Returns:
        A list of ``https://www.active.com/running?page=N`` URL strings in
        ascending page order; empty when ``page_limit`` is 1 or less.
    """
    return [
        'https://www.active.com/running?page=%s' % page_count
        for page_count in range(1, page_limit)
    ]
class ListeCourse_level1(scrapy.Spider):
    """Scrape event names from every URL produced by ``url_lister()``."""

    name = 'ListeCAP_ACTIVE'
    allowed_domains = ['www.active.com']
    start_urls = url_lister()

    def parse(self, response):
        """Yield one loaded ``ActiveItem`` per event link on the page."""
        event_links = response.xpath(
            '//*[@id="lpf-tabs2-a"]/article/div/div/div/a[@itemprop="url"]'
        )
        for uneCourse in event_links:
            loader = ItemLoader(ActiveItem(), selector=uneCourse)
            loader.add_xpath(
                'nom_evenement',
                './/div[2]/div/h5[@itemprop="name"]/text()',
            )
            # NOTE(review): these processors are assigned after add_xpath();
            # presumably the input processor was meant to be set before the
            # value is added, and `string` is not defined in this snippet.
            # Confirm intent before reordering.
            loader.default_input_processor = MapCompose(string)
            loader.default_output_processor = Join()
            yield loader.load_item()
The shell:
> 2018-01-23 17:22:29 [scrapy.core.scraper] DEBUG: Scraped from <200
> https://www.active.com/running?page=15>
> {
> 'nom_evenement': 'Enniscrone 10k run & 5k run/walk',
> }
> 2018-01-23 17:22:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.active.com/running?page=16> (referer: None)
> --------------------------------------------------
> SCRAPING DES ELEMENTS EVENTS
> --------------------------------------------------
> 2018-01-23 17:22:34 [scrapy.extensions.logstats] INFO: Crawled 17 pages (at 17 pages/min), scraped 155 items (at 155 items/min)
> 2018-01-23 17:22:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.active.com/running?page=17> (referer: None)
>
> --------------------------------------------------
> SCRAPING DES ELEMENTS EVENTS
> -------------------------------------------------- 2018-01-23 17:22:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET
> https://www.active.com/running?page=18> (referer: None)
> --------------------------------------------------
> SCRAPING DES ELEMENTS EVENTS
> -------------------------------------------------- 2018-01-23 17:22:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET
> https://www.active.com/running?page=19> (referer: None)
This is probably caused by the fact that there are only 17 pages with the content you are looking for, while you instruct Scrapy to visit all 480 pages of form https://www.active.com/running?page=NNN. A better approach is to check on each page you visit that there is a next page and only in that case yield Request to the next page.
So, I would refactor your code to something like (not tested):
class ListeCourse_level1(scrapy.Spider):
    """Scrape event names page by page, following pagination while it exists."""

    name = 'ListeCAP_ACTIVE'
    allowed_domains = ['www.active.com']
    base_url = 'https://www.active.com/running'
    start_urls = [base_url]

    def parse(self, response):
        """Yield the items on this page, then a request for the next page.

        The current page number travels in ``response.meta['page_number']``
        and defaults to 1 for the start URL.
        """
        event_links = response.xpath(
            '//*[@id="lpf-tabs2-a"]/article/div/div/div/a[@itemprop="url"]'
        )
        for uneCourse in event_links:
            loader = ItemLoader(ActiveItem(), selector=uneCourse)
            loader.add_xpath(
                'nom_evenement',
                './/div[2]/div/h5[@itemprop="name"]/text()',
            )
            loader.default_input_processor = MapCompose(string)
            loader.default_output_processor = Join()
            yield loader.load_item()

        # Only request another page when this one links to it.
        if response.xpath('//a[contains(@class, "next-page")]'):
            next_page = response.meta.get('page_number', 1) + 1
            # base_url is a class attribute, so it must be reached through
            # self; the bare name raises NameError inside a method.
            next_page_url = '{}?page={}'.format(self.base_url, next_page)
            yield scrapy.Request(
                next_page_url,
                callback=self.parse,
                meta={'page_number': next_page},
            )

Scrapy Splash is always returning the same page

For each of several Disqus users, whose profile URLs are known in advance, I want to scrape their names and the usernames of their followers. I'm using Scrapy and Splash to do so. However, when I parse the responses, it always seems to scrape the page of the first user. I tried setting wait to 10 and dont_filter to True, but it still isn't working. What should I do now?
Here is my spider:
import scrapy
from disqus.items import DisqusItem
class DisqusSpider(scrapy.Spider):
    """Fetch Disqus profile pages through Splash, then their followers pages."""

    name = "disqusSpider"
    start_urls = [
        "https://disqus.com/by/disqus_sAggacVY39/",
        "https://disqus.com/by/VladimirUlayanov/",
        "https://disqus.com/by/Beasleyhillman/",
        "https://disqus.com/by/Slick312/",
    ]
    # Splash options attached to every request via meta["splash"].
    splash_def = {"endpoint": "render.html", "args": {"wait": 10}}

    def start_requests(self):
        """Yield one Splash-rendered request per profile URL."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse_basic,
                dont_filter=True,
                meta={
                    "splash": self.splash_def,
                    "base_profile_url": url,
                },
            )

    def parse_basic(self, response):
        """Read the profile name and request the profile's followers page."""
        name = response.css(
            "h1.cover-profile-name.text-largest.truncate-line::text"
        ).extract_first()
        disqusItem = DisqusItem(name=name)
        request = scrapy.Request(
            url=response.meta["base_profile_url"] + "followers/",
            callback=self.parse_followers,
            dont_filter=True,
            meta={
                "item": disqusItem,
                "base_profile_url": response.meta["base_profile_url"],
                "splash": self.splash_def,
            },
        )
        # Single pre-formatted argument so the output is the same on
        # Python 2 and Python 3.
        print("parse_basic %s %s" % (response.url, request.url))
        yield request

    def parse_followers(self, response):
        """Print the request context and extract follower profile links."""
        print("parse_followers %s %s" % (
            response.meta["base_profile_url"], response.meta["item"]))
        # The snippet ends here: the links are extracted but not yet used.
        followers = response.css("div.user-info a::attr(href)").extract()
DisqusItem is defined as follows:
class DisqusItem(scrapy.Item):
    """Item with a profile ``name`` field and a ``followers`` field."""

    name = scrapy.Field()
    followers = scrapy.Field()
Here are the results:
2017-08-07 23:09:12 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://localhost:8050/render.html> (referer: None)
parse_followers https://disqus.com/by/disqus_sAggacVY39/ {'name': u'Trailer Trash'}
2017-08-07 23:09:14 [scrapy.extensions.logstats] INFO: Crawled 5 pages (at 5 pages/min), scraped 0 items (at 0 items/min)
2017-08-07 23:09:18 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://localhost:8050/render.html> (referer: None)
parse_followers https://disqus.com/by/VladimirUlayanov/ {'name': u'Trailer Trash'}
2017-08-07 23:09:27 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://localhost:8050/render.html> (referer: None)
parse_followers https://disqus.com/by/Beasleyhillman/ {'name': u'Trailer Trash'}
2017-08-07 23:09:40 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://localhost:8050/render.html> (referer: None)
parse_followers https://disqus.com/by/Slick312/ {'name': u'Trailer Trash'}
Here is the file settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for disqus project
#
BOT_NAME = 'disqus'
SPIDER_MODULES = ['disqus.spiders']
NEWSPIDER_MODULE = 'disqus.spiders'
ROBOTSTXT_OBEY = False

# Address of the Splash rendering service.
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Take the dupefilter from the same package as the middlewares above;
# `scrapyjs` is the legacy name of scrapy_splash and the two should not be mixed.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
DUPEFILTER_DEBUG = True
DOWNLOAD_DELAY = 10
I was able to get it to work using SplashRequest instead of scrapy.Request.
ex:
import scrapy
from disqus.items import DisqusItem
from scrapy_splash import SplashRequest
class DisqusSpider(scrapy.Spider):
    """Request each Disqus profile through Splash using ``SplashRequest``.

    Only ``start_requests`` is shown in this excerpt; the ``parse_basic``
    callback it references is not part of the snippet.
    """

    name = "disqusSpider"
    start_urls = [
        "https://disqus.com/by/disqus_sAggacVY39/",
        "https://disqus.com/by/VladimirUlayanov/",
        "https://disqus.com/by/Beasleyhillman/",
        "https://disqus.com/by/Slick312/",
    ]

    def start_requests(self):
        """Yield one ``SplashRequest`` per profile URL."""
        for url in self.start_urls:
            yield SplashRequest(
                url,
                self.parse_basic,
                dont_filter=True,
                endpoint='render.json',
                args={
                    'wait': 2,
                    'html': 1,
                },
            )

Scrapy skipping /browse/ that are not from /browse/ referer

I have a problem where my crawler is skipping browse pages that don't have a browse referrer.
What I'm trying to do is parse all pages that have /browse/ in the URL, regardless of the referrer.
The following is my code(updated according to paul t):
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from wallspider.items import Website
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class anchorspider(CrawlSpider):
    """Crawl mydomain.com and record metadata for pages linked from /browse/ pages."""

    name = "newbrowsepages"
    allowed_domains = ["mydomain.com"]
    start_urls = ["http://www.mydomain.com/"]
    rules = (
        # /browse/ pages: parse their links and keep following, skipping
        # links marked nofollow.
        Rule(
            SgmlLinkExtractor(allow=('/browse/',)),
            callback="parse_links",
            follow=True,
            process_links=lambda links: [
                link for link in links if not link.nofollow
            ],
        ),
        # Everything else except the denied URL patterns (no callback).
        Rule(SgmlLinkExtractor(
            allow=(),
            deny=(
                r'/[1-9]$',
                r'(bti=)[1-9]+(?:\.[1-9]*)?',
                r'(sort_by=)[a-zA-Z]',
                r'(sort_by=)[1-9]+(?:\.[1-9]*)?',
                r'(ic=32_)[1-9]+(?:\.[1-9]*)?',
                r'(ic=60_)[0-9]+(?:\.[0-9]*)?',
                r'(search_sort=)[1-9]+(?:\.[1-9]*)?',
                r'browse-ng.do\?',
                r'/page/',
                r'/ip/',
                r'out\+value',
                r'fn=',
                r'customer_rating',
                r'special_offers',
                r'search_sort=&',
                r'facet=',
            ),
        )),
    )

    def parse_start_url(self, response):
        """Run ``parse_links`` on the start page and return its requests as a list."""
        return list(self.parse_links(response))

    def parse_links(self, response):
        """Yield a ``parse_page`` request for every ``<a>`` element on the page."""
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a')
        domain = 'http://www.mydomain.com'
        for link in links:
            class_text = ''.join(link.select('./@class').extract())
            # NOTE(review): title is read from @class, same as class_text;
            # presumably @title was intended. Confirm before changing.
            title = ''.join(link.select('./@class').extract())
            url = ''.join(link.select('./@href').extract())
            # One dict with both keys: two successive assignments to `meta`
            # discarded the title entry.
            meta = {'title': title, 'class_text': class_text}
            yield Request(domain + url, callback=self.parse_page, meta=meta)

    def parse_page(self, response):
        """Yield a ``Website`` item describing the fetched page."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        for site in sites:
            item = Website()
            item['class_text'] = response.meta['class_text']
            item['url'] = response.url
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['referer'] = response.request.headers.get('Referer')
            item['canonical'] = site.xpath(
                '//head/link[@rel="canonical"]/@href'
            ).extract()
            yield item
My console log:
2014-01-28 12:22:03-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/ip/Ad-tech-Ultimate-Strength-Mini-8-Glue-Sticks-24-ct/17404367> (referer: http://www.mydomain.com/browse/crafts/other-arts-crafts/2637_667479_1043549/?amp;ic=48_0&ref=+422937&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L352)
2014-01-28 12:22:03-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/ip/Wood-Scrabble-Pendant-Tiles-Rectangle-18x20mm-100/30108666> (referer: http://www.mydomain.com/browse/crafts/other-arts-crafts/2637_667479_1043549/?amp;ic=48_0&ref=+422937&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L352)
2014-01-28 12:22:03-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/browse/apparel/women/5438_133162/> (referer: http://www.mydomain.com/browse/apparel/backpacks/5438_1045799_1045801_133211/?_refineresult=true&povid=P1171-C1110.2784+1455.2776+1115.2956-L136)
2014-01-28 12:22:03-0800 [newbrowsepages] DEBUG: Scraped from <200 http://www.mydomain.com/ip/Advantus-Paper-Holder/24575774>
{'canonical': [u'http://www.mydomain.com/ip/Advantus-Paper-Holder/24575774'],
'class_text': '',
'referer': 'http://www.mydomain.com/browse/crafts/craft-storage/2637_667479_1021741/?amp;ic=48_0&ref=+420081&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L357',
'title': [u'Advantus Paper Holder: Crafts : mydomain.com '],
'url': 'http://www.mydomain.com/ip/Advantus-Paper-Holder/24575774'}
2014-01-28 13:45:36-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/browse/party-occasions/plants-artificial-flowers/2637_79907/?_refineresult=true&povid=P1171-C1110.2784+1455.2776+1115.2956-L355> (referer: http://www.mydomain.com/)
2014-01-28 13:45:36-0800 [newbrowsepages] DEBUG: Redirecting (301) to <GET http://www.mydomain.com/browse/crafts/craft-storage/2637_667479_1021741/?amp;ic=48_0&ref=+420081&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L357> from <GET http://www.mydomain.com/browse/_/N-904x?amp%3Bic=48_0&amp%3Bref=+420081&catNavId=667479&povid=P1171-C1110.2784+1455.2776+1115.2956-L357>
2014-01-28 13:45:37-0800 [newbrowsepages] DEBUG: Crawled (200) <GET http://www.mydomain.com/browse/party-occasions/art-supplies/2637_667479_1094401/?_refineresult=true&povid=P1171-C1110.2784+1455.2776+1115.2956-L354> (referer: http://www.mydomain.com/)
Changed Rules to:
rules = (
Rule(SgmlLinkExtractor(allow=(),), follow= True,),
Rule(SgmlLinkExtractor(allow=('/browse/', ),deny=('/[1-9]$', '(bti=)[1-9]+(?:\.[1-9]*)?', '(sort_by=)[a-zA-Z]', '(sort_by=)[1-9]+(?:\.[1-9]*)?', '(ic=32_)[1-9]+(?:\.[1-9]*)?', '(ic=60_)[0-9]+(?:\.[0-9]*)?', '(search_sort=)[1-9]+(?:\.[1-9]*)?', 'browse-ng.do\?', '/page/', '/ip/', 'out\+value', 'fn=', 'customer_rating', 'special_offers', 'search_sort=&', 'facet=' )), callback="parse_links"),
I see at least 3 issues here:
your first rule references "parse_items" but it's not defined
your parse_start_url should return the list you're building
in parse_page, you should have item = Website() in each loop iteration, and use yield item at the end of each iteration
Spider code with these fixes:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from wallspider.items import Website
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class classspider(CrawlSpider):
    """Crawl mydomain.com and record metadata for pages linked from /browse/ pages."""

    name = "newbrowsepages"
    allowed_domains = ["mydomain.com"]
    start_urls = ["http://www.mydomain.com/"]
    rules = (
        # /browse/ pages: parse their links and keep following, skipping
        # links marked nofollow. The callback must name a method that exists
        # on this class; "parse_items" is not defined here.
        Rule(
            SgmlLinkExtractor(allow=('/browse/',)),
            callback="parse_links",
            follow=True,
            process_links=lambda links: [
                link for link in links if not link.nofollow
            ],
        ),
        # Everything else except the denied URL patterns (no callback).
        Rule(SgmlLinkExtractor(
            allow=(),
            deny=(
                r'/[1-9]$',
                r'(bti=)[1-9]+(?:\.[1-9]*)?',
                r'(sort_by=)[a-zA-Z]',
                r'(sort_by=)[1-9]+(?:\.[1-9]*)?',
                r'(ic=32_)[1-9]+(?:\.[1-9]*)?',
                r'(ic=60_)[0-9]+(?:\.[0-9]*)?',
                r'(search_sort=)[1-9]+(?:\.[1-9]*)?',
                r'browse-ng.do\?',
                r'/page/',
                r'/ip/',
                r'out\+value',
                r'fn=',
                r'customer_rating',
                r'special_offers',
                r'search_sort=&',
                r'facet=',
            ),
        )),
    )

    def parse_start_url(self, response):
        """Run ``parse_links`` on the start page and return its requests as a list."""
        return list(self.parse_links(response))

    def parse_links(self, response):
        """Yield a ``parse_page`` request for every ``<a>`` element on the page."""
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a')
        domain = 'http://www.mydomain.com'
        for link in links:
            class_text = ''.join(link.select('./@class').extract())
            # NOTE(review): title is read from @class, same as class_text;
            # presumably @title was intended. Confirm before changing.
            title = ''.join(link.select('./@class').extract())
            url = ''.join(link.select('./@href').extract())
            # One dict with both keys: two successive assignments to `meta`
            # discarded the title entry.
            meta = {'title': title, 'class_text': class_text}
            yield Request(domain + url, callback=self.parse_page, meta=meta)

    def parse_page(self, response):
        """Yield a ``Website`` item describing the fetched page."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        for site in sites:
            # A new item per iteration, yielded at the end of that iteration.
            item = Website()
            item['class_text'] = response.meta['class_text']
            item['url'] = response.url
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['referer'] = response.request.headers.get('Referer')
            item['canonical'] = site.xpath(
                '//head/link[@rel="canonical"]/@href'
            ).extract()
            yield item

Categories

Resources