Scrapy crawler will not crawl any webpages - python

I have been trying to get this crawler working, but I keep getting errors.
Can anyone suggest a way to get it to run?
The main spider code is:
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector

class gameSpider(scrapy.Spider):
    name = "game_spider.py"
    allowed_domains = ["*"]
    start_urls = [
        "http://www.game.co.uk/en/grand-theft-auto-v-with-gta-online-3-500-000-1085837?categoryIdentifier=706209&catGroupId="
    ]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('//*[@id="details301149"]/div/div/h2/text()').extract()
            """item['link'] = site.xpath('//a/@href').extract()
            item['description'] = site.xpath('//*[@id="overview"]/div[3]()').re('-\s[^\n]*\\r')"""
            items.append(item)
        print items
        return items
The item code is:
import scrapy

class GameItem(Item):
    name = Field()
    pass

Your start_urls link returns error 500.
There are no items:
In [7]: sites = response.xpath('//ul[@class="directory-url"]/li')

In [8]: sites
Out[8]: []
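For what it's worth, here is a minimal sketch of how the spider might look once the structural issues visible in the posted code are fixed: a spider name without the .py suffix, a real domain in allowed_domains instead of "*", and the posted GameItem used in place of the undefined Website. The start URL, the items module path and the XPaths are placeholders (assumptions), so they still need to be pointed at a page that actually responds and adapted to its markup:

import scrapy
from scrapy.selector import Selector
# assumes GameItem lives in your project's items module, e.g. myproject/items.py
from myproject.items import GameItem

class GameSpider(scrapy.Spider):
    name = "game_spider"
    allowed_domains = ["game.co.uk"]
    start_urls = [
        # placeholder: swap in a product URL that does not return a 500
        "http://www.game.co.uk/",
    ]

    def parse(self, response):
        sel = Selector(response)
        # placeholder selector - adjust it to the structure the page really serves
        for site in sel.xpath('//ul[@class="directory-url"]/li'):
            item = GameItem()
            item['name'] = site.xpath('.//h2/text()').extract()
            yield item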

Related

scraping multiple pages with scrapy

I am trying to use scrapy to scrape a website that has several pages of information.
My code is:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from tcgplayer1.items import Tcgplayer1Item

class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["http://www.tcgplayer.com/"]
    start_urls = ["http://store.tcgplayer.com/magic/journey-into-nyx?PageNumber=1"]

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[@class='magicCard']")
        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]
            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item
I am trying to scrape all the pages until it reaches the end of the pages ... sometimes there will be more pages than others, so it's hard to say exactly where the page numbers end.
The idea is to increment pageNumber until no titles are found. If there are no titles on the page, raise a CloseSpider exception to stop the spider:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from tcgplayer1.items import Tcgplayer1Item

URL = "http://store.tcgplayer.com/magic/journey-into-nyx?pageNumber=%d"

class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["tcgplayer.com"]
    start_urls = [URL % 1]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        print self.page_number
        print "----------"

        sel = Selector(response)
        titles = sel.xpath("//div[@class='magicCard']")
        if not titles:
            raise CloseSpider('No more pages')

        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]
            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item

        self.page_number += 1
        yield Request(URL % self.page_number)
This particular spider would go through all 8 pages of the data, then stop.
Hope that helps.

verify scrapy project code

I am trying to extract job offer information from this website, and this is my code:
from scrapy.spider import Spider
from scrapy.selector import Selector
from tutorial.items import DmozItem

class DmozSpider(Spider):
    name = "myspider"
    allowed_domains = ["tanitjobs.com/"]
    start_urls = ["http://tanitjobs.com/search-results-jobs/"]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="offre"]/div[@class="detail"]')
        items = []
        item = DmozItem()
        for site in sites:
            item['title'] = site.xpath('a/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            item['desc'] = site.xpath('div[@class="descriptionjob"]/text()').extract()
            items.append(item)
        return items
but the result is incorrect (empty item list):
{'desc': [],
'link': [u'lien'],
'title': []}
and many blocks like this ...
item = DmozItem() should be instantiated inside the loop, once per iteration; otherwise you keep overwriting the same item and appending that same item to the items list.
It should look like:
from scrapy.spider import Spider
from scrapy.selector import Selector
from tutorial.items import DmozItem

class DmozSpider(Spider):
    name = "myspider"
    allowed_domains = ["tanitjobs.com/"]
    start_urls = ["http://tanitjobs.com/search-results-jobs/"]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="offre"]/div[@class="detail"]')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.xpath('a/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            item['desc'] = site.xpath('div[@class="descriptionjob"]/text()').extract()
            items.append(item)
        return items
Your title xpath didn't take into account the <strong> tags on either side of the text, and your desc xpath needs to go down another div to retrieve the required information.
I just noticed that the xpath for job description varies. The xpath in the code below returns job descriptions for the first three results but not subsequent ones. You would need to examine subsequent results to determine how the xpath changes to retrieve descriptions for those jobs.
def parse(self, response):
    sel = Selector(response)
    sites = sel.xpath('//div[@class="offre"]/div[@class="detail"]')
    items = []
    for site in sites:
        item = DmozItem()
        item['title'] = site.xpath('normalize-space(a/strong/text())').extract()
        item['link'] = site.xpath('a/@href').extract()
        item['desc'] = site.xpath('normalize-space(./div/div[@class="descriptionjob"]/text())').extract()
        items.append(item)
    return items
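If the later job descriptions sit at a different depth, one option (untested against the live page, so treat the XPath as an assumption) is to search any descendant div with that class instead of the fixed ./div/div path:

item['desc'] = site.xpath('normalize-space(.//div[@class="descriptionjob"]/text())').extract()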

Scrapy only scraping first result of each page

I'm currently trying to run the following code but it keeps scraping only the first result of each page. Any idea what the issue may be?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123Item
import urlparse
from scrapy.http.request import Request

class MySpider(CrawlSpider):
    name = "xyz123"
    allowed_domains = ["www.xyz123.com.au"]
    start_urls = ["http://www.xyz123.com.au/",]

    rules = (
        Rule(SgmlLinkExtractor(allow=("",), restrict_xpaths=('//*[@id="1234headerPagination_hlNextLink"]',)),
             callback="parse_xyz", follow=True),
    )

    def parse_xyz(self, response):
        hxs = HtmlXPathSelector(response)
        xyz = hxs.select('//div[@id="1234SearchResults"]//div/h2')
        items = []
        for xyz in xyz:
            item = xyz123Item()
            item["title"] = xyz.select('a/text()').extract()[0]
            item["link"] = xyz.select('a/@href').extract()[0]
            items.append(item)
            return items
The BaseSpider version works well, scraping ALL the required data on the first page:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123

class MySpider(BaseSpider):
    name = "xyz123test"
    allowed_domains = ["xyz123.com.au"]
    start_urls = ["http://www.xyz123.com.au/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//div[@id="1234SearchResults"]//div/h2')
        items = []
        for titles in titles:
            item = xyz123Item()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
Sorry for the censoring. I had to censor the website for privacy reasons.
The first code crawls through the pages well the way I'd like it to crawl, however it only pulls the first item title and link. NOTE: The XPath of the first title using "inspect element" in google is:
//*[#id="xyz123SearchResults"]/div[1]/h2/a,
second is //*[#id="xyz123SearchResults"]/div[2]/h2/a
third is //*[#id="xyz123SearchResults"]/div[3]/h2/a etc.
I'm not sure if the div[n] bit is what's killing it. I'm hoping it's an easy fix.
Thanks
for xyz in xyz:
    item = xyz123Item()
    item["title"] = xyz.select('a/text()').extract()[0]
    item["link"] = xyz.select('a/@href').extract()[0]
    items.append(item)
    return items
Are you sure about the indentation of return items? It should be one level less.
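In other words, dedenting return items so it runs after the loop has finished, rather than on the first iteration, should let every title on the page be collected. A sketch of the corrected loop:

for xyz in xyz:
    item = xyz123Item()
    item["title"] = xyz.select('a/text()').extract()[0]
    item["link"] = xyz.select('a/@href').extract()[0]
    items.append(item)
return items  # one level less: return only after all results have been appended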

Scraper not finding pages

I have a spider written as below, but it doesn't seem to be getting to the parse function. Could someone take a quick look and let me know if I'm missing something? Am I implementing the SgmlLinkExtractor properly?
The spider should pick out all the links from the left sidebar, create a request from them, then parse the next page for a facebook link. It should also do this for other pages as specified in the SgmlLinkExtractor. At the moment, the spider is running, but not parsing any pages.
class PrinzSpider(CrawlSpider):
    name = "prinz"
    allowed_domains = ["prinzwilly.de"]
    start_urls = ["http://www.prinzwilly.de/"]

    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r'veranstaltungen-(.*)', ),
            ),
            callback='parse'
        ),
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        startlinks = hxs.select("//ul[@id='mainNav2']/li/a")
        print startlinks
        for link in startlinks:
            giglink = link.select('@href').extract()
            item = GigItem()
            item['gig_link'] = giglink
            request = Request(item['gig_link'], callback='parse_gig_page')
            item.meta['item'] = item
            yield request

    def parse_gig_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        gig_content = hxs.select("//div[@class='n']/table/tbody").extract()
        fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
        print '********** FB LINK ********', fb_link
        return item
EDIT:
settings.py
BOT_NAME = 'gigscraper'
SPIDER_MODULES = ['gigscraper.spiders']
NEWSPIDER_MODULE = 'gigscraper.spiders'
ITEM_PIPLINES = ['gigscraper.pipelines.GigscraperPipeline']
items.py
from scrapy.item import Item, Field

class GigItem(Item):
    gig_link = Field()
pipelines.py
class GigscraperPipeline(object):
    def process_item(self, item, spider):
        print 'here I am in the pipeline'
        return item
Two problems:
extract() returns a list; you are missing [0].
The Request callback should not be a string; use self.parse_gig_page.
Here's the modified code (working):
import re

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector

class GigItem(Item):
    gig_link = Field()

class PrinzSpider(CrawlSpider):
    name = "prinz"
    allowed_domains = ["prinzwilly.de"]
    start_urls = ["http://www.prinzwilly.de/"]

    rules = (Rule(SgmlLinkExtractor(allow=(r'veranstaltungen-(.*)',)), callback='parse'),)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        startlinks = hxs.select("//ul[@id='mainNav2']/li/a")
        for link in startlinks:
            item = GigItem()
            item['gig_link'] = link.select('@href').extract()[0]
            yield Request(item['gig_link'], callback=self.parse_gig_page, meta={'item': item})

    def parse_gig_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        gig_content = hxs.select("//div[@class='n']/table/tbody").extract()[0]
        fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
        print '********** FB LINK ********', fb_link
        return item
Hope that helps.

scrapy: newbie attempting to debug code

Total newbie, trying to get Scrapy to read a list of URLs from a CSV and return the items in a CSV.
I need some help figuring out where I'm going wrong here:
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import random

class incyspider(BaseSpider):
    name = "incyspider"

    def __init__(self):
        super(incyspider, self).__init__()
        domain_name = "incyspider.co.uk"
        f = open("urls.csv")
        start_urls = [url.strip() for url in f.readlines()]
        f.close

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="Product"]')
        items = []
        for site in sites:
            item['title'] = hxs.select('//div[@class="Name"]/node()').extract()
            item['hlink'] = hxs.select('//div[@class="Price"]/node()').extract()
            item['price'] = hxs.select('//div[@class="Codes"]/node()').extract()
            items.append(item)
        return items

SPIDER = incyspider()
Here's the items.py code:
from scrapy.item import Item, Field

class incyspider(Item):
    # define the fields for your item here like:
    # name = Field()
    title = Field()
    hlink = Field()
    price = Field()
    pass
To run, I'm using
scrapy crawl incyspider -o items.csv -t csv
I would seriously appreciate any pointers.
I'm not exactly sure, but after a quick look at your code I would say that at least you need to replace this line
sites = hxs.select('//div[@class="Product"]')
with this line
sites = hxs.select('//div[@class="Product"]').extract()
As a first punt at answering this, your spider code is missing an import for your incyspider item class. Also you're not creating an instance of any kind of item to store the title/hlink/price info, so the items.append(item) line might complain.
Since your spider is also called incyspider, you should rename the item to something like incyspiderItem and then import it in your spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import random
from incyspider.items import incyspiderItem

class incyspider(BaseSpider):
    name = "incyspider"

    def __init__(self):
        super(incyspider, self).__init__()
        domain_name = "incyspider.co.uk"
        f = open("urls.csv")
        start_urls = [url.strip() for url in f.readlines()]
        f.close

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="Product"]')
        items = []
        for site in sites:
            item = incyspiderItem()
            item['title'] = hxs.select('//div[@class="Name"]/node()').extract()
            item['hlink'] = hxs.select('//div[@class="Price"]/node()').extract()
            item['price'] = hxs.select('//div[@class="Codes"]/node()').extract()
            items.append(item)
        return items
If I'm wrong, then please edit the question to explain how you know there is a problem with the code, e.g. is the expected output different from the actual output? If so, how?
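One further thing that may be worth checking, beyond the missing item instance: the selects inside the loop still run against the whole page (hxs), so every item would end up with the same page-wide values. If the intention is one item per Product block, relative XPaths on site are probably what's wanted; a sketch under that assumption:

for site in sites:
    item = incyspiderItem()
    # relative to the current Product div rather than the whole page
    item['title'] = site.select('.//div[@class="Name"]/node()').extract()
    item['hlink'] = site.select('.//div[@class="Price"]/node()').extract()
    item['price'] = site.select('.//div[@class="Codes"]/node()').extract()
    items.append(item)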
