I'm working on a Scrapy spider that crawls this website:
Page 1: http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626
Subpage example (from page 1): http://www.randstad.nl/mwp2/faces/baanDetails?aanvraagnummer=1177658&_adf.ctrl-state=16ovo4scmu_4&sc=0&_afrLoop=15790145645866794
Page 2: http://www.randstad.nl/mwp2/faces/baanDetails?aanvraagnummer=1509606&_adf.ctrl-state=16ovo4scmu_4&sc=0&_afrLoop=15790170887272918
What I think goes wrong: the spider gets all the links from page 1, visits those subpages (i.e. the links it extracted), and then moves on to page 2 to do the same. But after page 1 it seems to pick up only the first link of page 2 (instead of all the links), then continues to page 3 and does the same there.
I've tried a lot of different code and I still can't get it right. I hope you can have a look at my code and point out what I'm doing wrong.
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from craig.items import CraigItem
from scrapy.http import Request
import re

class CraigSpiderSpider(CrawlSpider):
    name = "craig_spider"
    allowed_domains = ["randstad.nl"]
    start_urls = (
        "http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626",
        "http://www.randstad.nl/mwp2/faces/baanZoeken?"
    )
    rules = (
        Rule(SgmlLinkExtractor(allow=("filters=vakgebied!5626", "pagina=")), callback="parse", follow=True),
    )
    def parse(self, response):
        sel = Selector(response)

        # Collect all links
        for link in sel.xpath(".//a[contains(@class, 'outer-read-more-link')]/@href").extract():
            yield Request(link, callback=self.parse)

        # Visit every link and collect all the text
        text_list = sel.xpath('//div[@id="basePage:page:twoColumn:r2:0:functieOmschrijvingPanel::content"]/text()').extract()
        title_list = sel.xpath('//div[@id="basePage:page:panelTitleHeader"]//td[@class="af_panelBox_header-text"]//h1[@class="af_panelBox_header-element"]/text()').extract()
        label_samenvatting = sel.xpath('//div[@id="basePage:page:twoColumn:r1:0:pfl1b"]//table//td//label/text()').extract()
        opleidingniveau_list = sel.xpath('//div[@id="basePage:page:twoColumn:r1:0:pl1"]//ul//li/text()').extract()
        soortbaan_list = sel.xpath('//table[@id="basePage:page:twoColumn:r1:0:soortDienstverbandRNL"]//td[@class="AFContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
        uren_per_week_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:it5"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
        vakgebied_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:vakgebieden"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]//li/text()').extract()
        branche_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:aanvraagBranch"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
        datum = sel.xpath('//span[@class="date-changed"]/text()').extract()

        if text_list:
            title = ' '.join(title_list)
            text = ' '.join(text_list)
            samenvatting = ' '.join(label_samenvatting)
            opleidingniveau = ' '.join(opleidingniveau_list)
            soortbaan = ' '.join(soortbaan_list)
            urenperweek = ' '.join(uren_per_week_list)
            vakgebied = ' '.join(vakgebied_list)
            branche = ' '.join(branche_list)

            item = CraigItem()
            item['link'] = response.url
            item['title'] = title
            item['text'] = text
            item['samenvatting'] = samenvatting
            item['opleidingniveau'] = opleidingniveau
            item['soortbaan'] = soortbaan
            item['urenperweek'] = urenperweek
            item['vakgebied'] = vakgebied
            item['branche'] = branche
            item['date'] = datum
            yield item
Items code:
from scrapy.item import Item, Field

class CraigItem(Item):
    title = Field()
    text = Field()
    link = Field()
    site = Field()
    date = Field()
    samenvatting = Field()
    opleidingniveau = Field()
    soortbaan = Field()
    urenperweek = Field()
    vakgebied = Field()
    branche = Field()
I think you should use CrawlSpider, not BaseSpider, when you need to follow links.
class CraigSpider(CrawlSpider):
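One thing worth noting: CrawlSpider uses the parse method internally to drive its rules, so pointing a Rule's callback at "parse" (or overriding parse yourself) tends to break link following. A rough sketch of how the spider might look with a renamed callback and absolute detail URLs; the callback names are just placeholders, and the XPaths are reused from the question:

import urlparse

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request

class CraigSpiderSpider(CrawlSpider):
    name = "craig_spider"
    allowed_domains = ["randstad.nl"]
    start_urls = ["http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626"]

    # Follow the pagination links; do NOT name the callback "parse"
    rules = (
        Rule(SgmlLinkExtractor(allow=("pagina=",)), callback="parse_list_page", follow=True),
    )

    def parse_list_page(self, response):
        sel = Selector(response)
        # The job detail links can be relative, so join them against the page URL
        for href in sel.xpath(".//a[contains(@class, 'outer-read-more-link')]/@href").extract():
            yield Request(urlparse.urljoin(response.url, href), callback=self.parse_job_page)

    def parse_job_page(self, response):
        # extract title, text, samenvatting, etc. here, exactly as in the original parse()
        pass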
Related
I am trying to use Scrapy to scrape a website that has several pages of information.
My code is:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from tcgplayer1.items import Tcgplayer1Item

class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["http://www.tcgplayer.com/"]
    start_urls = ["http://store.tcgplayer.com/magic/journey-into-nyx?PageNumber=1"]

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[@class='magicCard']")
        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]
            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item
I am trying to scrape all the pages until it reaches the end ... sometimes there will be more pages than others, so it's hard to say exactly where the page numbers end.
The idea is to increment pageNumber until no titles are found. If there are no titles on a page, raise a CloseSpider exception to stop the spider:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from tcgplayer1.items import Tcgplayer1Item

URL = "http://store.tcgplayer.com/magic/journey-into-nyx?pageNumber=%d"

class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["tcgplayer.com"]
    start_urls = [URL % 1]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        print self.page_number
        print "----------"

        sel = Selector(response)
        titles = sel.xpath("//div[@class='magicCard']")
        if not titles:
            raise CloseSpider('No more pages')

        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]
            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item

        self.page_number += 1
        yield Request(URL % self.page_number)
This particular spider would go through all 8 pages of the data, then stop.
Hope that helps.
I am new to Python programming and to Scrapy. I have set up my crawler, and it was working until I got to the point where I wanted to figure out how to download images. The error I am getting is "cannot import name NsiscrapePipeline". I don't know what I am doing wrong, and I don't understand some of the documentation because I am new. Please help.
Items File
from scrapy.item import Item, Field

class NsiscrapeItem(Item):
    # define the fields for your item here like:
    # name = Field()
    location = Field()
    stock_number = Field()
    year = Field()
    manufacturer = Field()
    model = Field()
    length = Field()
    price = Field()
    status = Field()
    url = Field()
    pass
Spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from NSIscrape.items import NsiscrapeItem
from scrapy.http import Request
from scrapy.contrib.pipeline.images import NsiscrapePipeline
import Image

class NsiscrapeSpider(BaseSpider):
    name = "Nsiscrape"
    allowed_domain = ["yachtauctions.com"]
    start_urls = [
        "http://www.yachtauctions.com/inventory/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//tr')
        items = []
        for site in sites:
            item = NsiscrapeItem()
            item['location'] = site.select('td[2]/text()').extract()
            item['stock_number'] = site.select('td[3]/a/text()').extract()
            item['year'] = site.select('td[4]/text()').extract()
            item['manufacturer'] = site.select('td[5]/text()').extract()
            item['model'] = site.select('td[6]/text()').extract()
            item['length'] = site.select('td[7]/text()').extract()
            item['price'] = site.select('td[8]/text()').extract()
            item['status'] = site.select('td[10]/img/@src').extract()
            item['url'] = site.select('td[1]/a/@href').extract()
            item['image_urls'] = site.select('td/a[3]/img/@data-original').extract()
            item['images'] = item['image_urls']
            yield Request(item['url'][0], meta={'item': item}, callback=self.product_detail_page)

    def product_detail_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.request.meta['item']
        # add all image urls to item['image_urls']
        yield item
settings
ITEM_PIPELINES = ['scrapy.contrib.pipeline.image.NsiscrapePipeline']
IMAGES_STORE = 'c:\Python27\NSIscrape\IMG'
IMAGES_EXPIRES = 90
Pipelines. This is where I am unsure if I am missing something.
from scrapy.item import Item

class NsiscrapePipeline(Item):
    image_urls = Field()
    images = Field()

    def process_item(self, item, spider):
        return item
error
File "NSIscrape\spiders\NSI_Spider.py", line 9, in <module>
from scrapy.contrib.pipeline.images import NsiscrapePipeline
ImportError: cannot import name NsiscrapePipeline
You tried to pass a list, but this function accepts only a string. Pass only one element of the list (for example, list[0]).
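For illustration, the kind of change this refers to might look like the following, using the field names from the spider above; .extract() always returns a list of strings, while Request() expects a single URL string:

links = site.select('td[1]/a/@href').extract()   # a list of strings
if links:
    # pass one string, not the whole list
    yield Request(links[0], meta={'item': item}, callback=self.product_detail_page)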
Here's my final code that's working. There were two issues:
1: I was missing the second slash that needed to be in the request --> //td[1]/a[3]/img/@data-original
2: I had to check the full URL at which the image would be displayed and join the pieces together: the main (allowed) URL and the image URL.
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    images = hxs.select('//tr')
    url = []
    for image in images:
        urls = NsiscrapeItem()
        urls['image_urls'] = ["http://www.yachtauctions.com" + x for x in image.select('//td[1]/a[3]/img/@data-original').extract()]
        url.append(urls)
    return url
That isn't part of the library :) - at least judging by their current master branch.
I think you're looking for ImagesPipeline.
Their example may help!
P.S. I don't think you can custom-name the class - at least not the way Scrapy is designed; I'm reasonably sure you use their class ;)
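If it helps, a minimal sketch of wiring up the built-in ImagesPipeline instead of a custom-named class (module paths follow the old scrapy.contrib layout used elsewhere in this thread; newer Scrapy versions use scrapy.pipelines.images):

# settings.py
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGES_STORE = 'c:/Python27/NSIscrape/IMG'   # forward slashes avoid backslash-escape problems
IMAGES_EXPIRES = 90

# items.py -- the pipeline looks for these two fields on the item
from scrapy.item import Item, Field

class NsiscrapeItem(Item):
    image_urls = Field()   # URLs for the pipeline to download
    images = Field()       # populated by the pipeline with the download results
    # ... plus the other fields from the question ...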
Hey, I am making a project using Scrapy in which I need to scrape business details from a business directory: http://directory.thesun.co.uk/find/uk/computer-repair
The problem I am facing: when I crawl the page, my crawler fetches the details of only the 1st page, whereas I need to fetch the details of the remaining 9 pages as well; that is, all 10 pages.
Below I am showing my spider code, items.py, and settings.py.
Please see my code and help me solve it.
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item

class ProjectSpider(BaseSpider):
    name = "project2spider"
    allowed_domains = ["http://directory.thesun.co.uk/"]
    start_urls = [
        "http://directory.thesun.co.uk/find/uk/computer-repair"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="abTbl "]')
        items = []
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            items.append(item)
        return items
My items.py code is as follows:
from scrapy.item import Item, Field

class Project2Item(Item):
    Catogory = Field()
    Bussiness_name = Field()
    Description = Field()
    Number = Field()
    Web_url = Field()
    adress_name = Field()
    Photo_name = Field()
    Photo_path = Field()
My settings.py is:
BOT_NAME = 'project2'
SPIDER_MODULES = ['project2.spiders']
NEWSPIDER_MODULE = 'project2.spiders'
Please help me to extract details from the other pages too...
When fetching the description with .select('span/text()'), you are selecting text from ALL spans in //div[@class="abTbl "].
To extract the last span you can use the 'span[last()]/text()' XPath.
BTW, this http://www.w3schools.com/xpath/xpath_syntax.asp should help you with XPath.
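A small illustration of the difference between the two XPaths (the HTML here is made up just for the example):

from scrapy.http import HtmlResponse
from scrapy.selector import HtmlXPathSelector

body = '<div class="abTbl "><span>0207 946 0000</span><span>Computer repair shop</span></div>'
hxs = HtmlXPathSelector(HtmlResponse(url='http://example.com', body=body, encoding='utf-8'))

site = hxs.select('//div[@class="abTbl "]')[0]
print site.select('span/text()').extract()          # ['0207 946 0000', 'Computer repair shop'] -- every span
print site.select('span[last()]/text()').extract()  # ['Computer repair shop'] -- only the last span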
After banging my head several times, I am finally coming here.
Problem: I am trying to download the content of each Craigslist posting. By content I mean the "posting body", i.e. the description of the cell phone. (Looking for a new old phone, since the iPhone is done with all the excitement.)
The code is an awesome piece of work by Michael Herman.
My Spider Class
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import *
from craig.items import CraiglistSampleItem

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://minneapolis.craigslist.org/moa/"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("index\d00\.html", ), restrict_xpaths=('//p[@class="nextpage"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        items = []
        for titles in titles:
            item = CraiglistSampleItem()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
And the Item class
from scrapy.item import Item, Field

class CraiglistSampleItem(Item):
    title = Field()
    link = Field()
Since the code will traverse many links, I wanted to save the description of each cell phone in a separate CSV, but one more column in the CSV would be fine as well.
Any lead!!!
Instead of returning items in the parse_items method, you should return/yield a Scrapy Request instance in order to get the description from the item page; the link and title you can pass inside an Item, and the Item inside the meta dictionary:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
from scrapy.item import Item, Field

class CraiglistSampleItem(Item):
    title = Field()
    link = Field()
    description = Field()

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://minneapolis.craigslist.org/moa/"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("index\d00\.html", ), restrict_xpaths=('//p[@class="nextpage"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//span[@class='pl']")
        for title in titles:
            item = CraiglistSampleItem()
            item["title"] = title.select("a/text()").extract()[0]
            item["link"] = title.select("a/@href").extract()[0]

            url = "http://minneapolis.craigslist.org%s" % item["link"]
            yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

    def parse_item_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item['description'] = hxs.select('//section[@id="postingbody"]/text()').extract()
        return item
Run it and you will see an additional description column in your output CSV file.
Hope that helps.