Recursive Scraping Craigslist with Scrapy and Python 2.7 - python

I'm having trouble getting the spider to follow the next page of ads without following every link it finds, eventually returning every craigslist page. I've played around with the rule as I know that's where the problem lies, but I either get just the first page, every page on craigslist, or nothing. Any help?
Here's my current code:
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
class PageSpider(CrawlSpider):
name = "cto"
allowed_domains = ["medford.craigslist.org"]
start_urls = ["http://medford.craigslist.org/cto/"]
rules = (
Rule(
SgmlLinkExtractor(allow_domains=("medford.craigslist.org", )),
callback='parse_page', follow=True
),
)
def parse_page(self, response):
hxs = HtmlXPathSelector(response)
rows = hxs.select('//div[#class="content"]/p[#class="row"]')
for row in rows:
item = CraigslistSampleItem()
link = row.xpath('.//span[#class="pl"]/a')
item['title'] = link.xpath("text()").extract()
item['link'] = link.xpath("#href").extract()
item['price'] = row.xpath('.//span[#class="l2"]/span[#class="price"]/text()').extract()
url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)
def parse_item_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
item['description'] = hxs.select('//section[#id="postingbody"]/text()').extract()
return item

You should specify an allow argument of SgmlLinkExtractor:
allow (a regular expression (or list of)) – a single regular
expression (or list of regular expressions) that the (absolute) urls
must match in order to be extracted. If not given (or empty), it will
match all links.
rules = (
Rule(SgmlLinkExtractor(allow='http://medford.craigslist.org/cto/'),
callback='parse_page', follow=True),
)
This will keep all links under http://medford.craigslist.org/cto/ url.
Hope that helps.

Related

Scrapy: Looping through search results only returns first item

I'm scraping a site by going through the search page, then looping through all results within. However it only seems to be returning the first result for each page. I also don't think it's hitting the start page's results either.
Secondly, the price is returning as some sort of Unicode (£ symbol) - how can I remove it altogether just leaving the price?
'regular_price': [u'\xa38.59'],
Here is the HTML:
http://pastebin.com/F8Lud0hu
Here's the spider:
import scrapy
import random
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from cdl.items import candleItem
class cdlSpider(CrawlSpider):
name = "cdl"
allowed_domains = ["www.xxxx.co.uk"]
start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']
rules = [
Rule(LinkExtractor(
allow=['advanced_search_result\.php\?sort=2a&page=\d*']),
callback='parse_listings',
follow=True)
]
def parse_listings(self, response):
sel = Selector(response)
urls = sel.css('a.product_img')
for url in urls:
url = url.xpath('#href').extract()[0]
return scrapy.Request(url,callback=self.parse_item)
def parse_item(self, response):
candle = candleItem()
n = response.css('.prod_info_name h1')
candle['name'] = n.xpath('.//text()').extract()[0]
if response.css('.regular_price'):
candle['regular_price'] = response.css('.regular_price').xpath('.//text()').extract()
else:
candle['was_price'] = response.css('.was_price strong').xpath('.//text()').extract()
candle['now_price'] = response.css('.now_price strong').xpath('.//text()').extract()
candle['referrer'] = response.request.headers.get('Referer', None)
candle['url'] = response.request.url
yield candle
Yes it's returning only the first result because of your parse_listing method (you're returning the first url and you should be yielding it). I would do something like:
def parse_listings(self, response):
for url in response.css('a.product_img::attr(href)').extract():
yield Request(url, callback=self.parse_item)
In that case I would even do something like:
class CdlspiderSpider(CrawlSpider):
name = 'cdlSpider'
allowed_domains = ['www.xxxx.co.uk']
start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']
rules = [
Rule(LinkExtractor(allow='advanced_search_result\.php\?sort=2a&page=\d*')),
Rule(LinkExtractor(restrict_css='a.product_img'), callback='parse_item')
]
def parse_item(self, response):
...
if response.css('.regular_price'):
candle['regular_price'] = response.css('.regular_price::text').re_first(r'\d+\.?\d*')
else:
candle['was_price'] = response.css('.was_price strong::text').re_first(r'\d+\.?\d*')
candle['now_price'] = response.css('.now_price strong::text').re_first(r'\d+\.?\d*')
...
return candle
To remove the £, just replace it with an empty string like this:
pricewithpound = u'\xa38.59'
price = pricewithpound.replace(u'\xa3', '')
To investigate the scrapy issue, can you please provide the HTML source ?

Scrapy: Do not crawl links on other domains page

Below id my spider I created to get all links on NecToday.com for example.
import socket
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
class PropertiesItem(scrapy.Item):
# Primary fields
title = scrapy.Field()
url = scrapy.Field()
class NecSpider(CrawlSpider):
name = "NecSpider"
#allowed_domains = ["nectoday.com"]
start_urls = ["http://nectoday.com"]
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a',)), callback="parse_items", follow= True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
print(response.url)
item = PropertiesItem()
item["title"] = response.xpath("//title/text()").extract()
item["url"] = response.url
return(item)
This code starts to fetch all links present on site. Some of the pages have YouTube links as well. The problem is that once the first YouTube link is crawled, it starts to crawl other YouTube links referenced from the first YouTube link.
I want to crawl the first YouTube link, but no others. YouTube is just example. Tomorrow that can be another site as well. How can this be achieved?
Why not try something along the lines of this:
start_urls=["http://nectoday.com"]
def parse(self, response):
#parse whatever you need
for url in response.selector.xpath('//#href').extract():
if 'youtube.com' in url:
yield scrapy.Request(url, callback=self.parse_no_follow)
else:
yield scrapy.Request(url, callback=self.parse)
def parse_no_follow(self, response):
#parse whatever you want and not follow anymore links
This will only be scraping from your allowed domain.
class QuotesSpider(CrawlSpider):
name = "your app name"
n=0
allowed_domains = ['domain']
start_urls=['anywebpage']
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
def parse_item(self, response):
QuotesSpider.n=QuotesSpider.n+1
if (len(response.body)>100):
h = html2text.HTML2Text()
h.ignore_links = True
h.ignore_images = True
h.body_width = 0
dd = response.body.decode("utf-8")
init=dd.find("<p>")
while init>0:
end = dd.find("</p>", init)
if end>0:
o=h.handle(dd[init:end+4]+"\n")
supersentences=o.split('\n')

Python Recursive Scraping with Scrapy

I'm trying to make a scraper that will pull links, titles, prices and the body of posts on craigslist. I have been able to get the prices, but it returns the price for every listing on the page, not just for the specific row. I am also unable to get it to go to the next page and continue scraping.
This is the tutorial I am using - http://mherman.org/blog/2012/11/08/recursively-scraping-web-pages-with-scrapy/
I've tried suggestions from this thread, but still can't make it work - Scrapy Python Craigslist Scraper
The page I'm trying to scrape is - http://medford.craigslist.org/cto/
In the link price variable, if I remove the // before span[#class="l2"] it returns no prices, but if I leave it there it includes every price on the page.
For the rules, I've tried playing with the class tags but it seems to hang on the first page. I'm thinking I might need separate spider classes?
Here is my code:
#-------------------------------------------------------------------------------
# Name: module1
# Purpose:
#
# Author: CD
#
# Created: 02/03/2014
# Copyright: (c) CD 2014
# Licence: <your licence>
#-------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
import sys
class PageSpider(BaseSpider):
name = "cto"
allowed_domains = ["medford.craigslist.org"]
start_urls = ["http://medford.craigslist.org/cto/"]
rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html", ), restrict_xpaths=('//span[#class="button next"]' ,))
, callback="parse", follow=True), )
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select('//span[#class="pl"] | //span[#class="l2"]')
for title in titles:
item = CraigslistSampleItem()
item['title'] = title.select("a/text()").extract()
item['link'] = title.select("a/#href").extract()
item['price'] = title.select('//span[#class="l2"]//span[#class="price"]/text()').extract()
url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)
def parse_item_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
item['description'] = hxs.select('//section[#id="postingbody"]/text()').extract()
return item
The idea is simple: find all paragraphs in a div with a class="content". Then from every paragraph extract link, text link and a price. Note that select() method is deprecated currentlty, use xpath() instead.
Here's a modified version of parse() method:
def parse(self, response):
hxs = HtmlXPathSelector(response)
rows = hxs.select('//div[#class="content"]/p[#class="row"]')
for row in rows:
item = CraigslistSampleItem()
link = row.xpath('.//span[#class="pl"]/a')
item['title'] = link.xpath("text()").extract()
item['link'] = link.xpath("#href").extract()
item['price'] = row.xpath('.//span[#class="l2"]/span[#class="price"]/text()').extract()
url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)
This is a sample of what I'm getting:
{'description': [u"\n\t\tHave a nice, sturdy, compact car hauler/trailer. May be used for other hauling like equipstment, ATV's and the like, Very solid and in good shape. Parice to sell at only $995. Call Bill at 541 944 2929 top see or Roy at 541 9733421. \n\t"],
'link': [u'/cto/4354771900.html'],
'price': [u'$995'],
'title': [u'compact sturdy car trailer ']}
Hope that helps.

Scrapy only scraping first result of each page

I'm currently trying to run the following code but it keeps scraping only the first result of each page. Any idea what the issue may be?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123Item
import urlparse
from scrapy.http.request import Request
class MySpider(CrawlSpider):
name = "xyz123"
allowed_domains = ["www.xyz123.com.au"]
start_urls = ["http://www.xyz123.com.au/",]
rules = (Rule (SgmlLinkExtractor(allow=("",),restrict_xpaths=('//*[#id="1234headerPagination_hlNextLink"]',))
, callback="parse_xyz", follow=True),
)
def parse_xyz(self, response):
hxs = HtmlXPathSelector(response)
xyz = hxs.select('//div[#id="1234SearchResults"]//div/h2')
items = []
for xyz in xyz:
item = xyz123Item()
item ["title"] = xyz.select('a/text()').extract()[0]
item ["link"] = xyz.select('a/#href').extract()[0]
items.append(item)
return items
The Basespider version works well scraping ALL the required data on the first page:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123
class MySpider(BaseSpider):
name = "xyz123test"
allowed_domains = ["xyz123.com.au"]
start_urls = ["http://www.xyz123.com.au/"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select('//div[#id="1234SearchResults"]//div/h2')
items = []
for titles in titles:
item = xyz123Item()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
items.append(item)
return items
Sorry for the censoring. I had to censor the website for privacy reasons.
The first code crawls through the pages well the way I'd like it to crawl, however it only pulls the first item title and link. NOTE: The XPath of the first title using "inspect element" in google is:
//*[#id="xyz123SearchResults"]/div[1]/h2/a,
second is //*[#id="xyz123SearchResults"]/div[2]/h2/a
third is //*[#id="xyz123SearchResults"]/div[3]/h2/a etc.
I'm not sure if the div[n] bit is what's killing it. I'm hoping it's an easy fix.
Thanks
for xyz in xyz:
item = xyz123Item()
item ["title"] = xyz.select('a/text()').extract()[0]
item ["link"] = xyz.select('a/#href').extract()[0]
items.append(item)
return items
Are you sure about the indentation of the return items ? It should be one less.

Scraper not finding pages

I have a spider written as below, but it doesn't seem to be getting to the function parse. Could someone take a quick look and let me know if I'm missing something. Am I implementing the SgmlLinkExtractor properly?
The spider should pick out all the links from the left sidebar, create a request from them, then parse the next page for a facebook link. It should also do this for other pages as specified in the SgmlLinkExtractor. At the moment, the spider is running, but not parsing any pages.
class PrinzSpider(CrawlSpider):
name = "prinz"
allowed_domains = ["prinzwilly.de"]
start_urls = ["http://www.prinzwilly.de/"]
rules = (
Rule(
SgmlLinkExtractor(
allow=(r'veranstaltungen-(.*)', ),
),
callback='parse'
),
)
def parse(self, response):
hxs = HtmlXPathSelector(response)
startlinks = hxs.select("//ul[#id='mainNav2']/li/a")
print startlinks
for link in startlinks:
giglink = link.select('#href').extract()
item = GigItem()
item['gig_link'] = giglink
request = Request(item['gig_link'], callback='parse_gig_page')
item.meta['item'] = item
yield request
def parse_gig_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
gig_content = hxs.select("//div[#class='n']/table/tbody").extract()
fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
print '********** FB LINK ********', fb_link
return item
EDIT **
settings.py
BOT_NAME = 'gigscraper'
SPIDER_MODULES = ['gigscraper.spiders']
NEWSPIDER_MODULE = 'gigscraper.spiders'
ITEM_PIPLINES = ['gigscraper.pipelines.GigscraperPipeline']
items.py
from scrapy.item import Item, Field
class GigItem(Item):
gig_link = Field()
pipelines.py
class GigscraperPipeline(object):
def process_item(self, item, spider):
print 'here I am in the pipeline'
return item
Two problems:
extract() returns a list, you are missing [0]
Request's callback should not be a string, use self.parse_gig_page
Here's the modified code (working):
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
class GigItem(Item):
gig_link = Field()
class PrinzSpider(CrawlSpider):
name = "prinz"
allowed_domains = ["prinzwilly.de"]
start_urls = ["http://www.prinzwilly.de/"]
rules = (Rule(SgmlLinkExtractor(allow=(r'veranstaltungen-(.*)',)), callback='parse'),)
def parse(self, response):
hxs = HtmlXPathSelector(response)
startlinks = hxs.select("//ul[#id='mainNav2']/li/a")
for link in startlinks:
item = GigItem()
item['gig_link'] = link.select('#href').extract()[0]
yield Request(item['gig_link'], callback=self.parse_gig_page, meta={'item': item})
def parse_gig_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
gig_content = hxs.select("//div[#class='n']/table/tbody").extract()[0]
fb_link = re.findall(r'(?:www.facebook.com/)(.*)', gig_content)
print '********** FB LINK ********', fb_link
return item
Hope that helps.

Categories

Resources