Still getting to grips with Scapy and been following this tutorial. Having a little trouble following however as I am getting the following: NameError: name 'DmozItem' is not defined when I run this:
import scrapy
from scrapy import Item, Field
class QuotesItems(scrapy.Item):
area_name = scrapy.Field()
room_type = scrapy.Field()
period = scrapy.Field()
duration_weekly = scrapy.Field()
guide_total = scrapy.Field()
amenities = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "not_quotes"
start_urls = [
'http://www.unitestudents.com/',
]
# Step 1
def parse(self, response):
for city in response.xpath('//select[#id="frm_homeSelect_city"]/option[not(contains(text(),"Select your city"))]/text()').extract(): # Select all cities listed in the select (exclude the "Select your city" option)
yield scrapy.Request(response.urljoin("/"+city), callback=self.parse_citypage)
# Step 2
def parse_citypage(self, response):
for url in response.xpath('//div[#class="property-header"]/h3/span/a/#href').extract(): #Select for each property the url
yield scrapy.Request(response.urljoin(url), callback=self.parse_unitpage)
# Step 3
def parse_unitpage(self, response):
for final in response.xpath('//div/div/div[#class="content__btn"]/a/#href').extract(): #Select final page for data scrape
yield scrapy.Request(response.urljoin(final), callback=self.parse_final)
#Step 4
def parse(self, response):
for sel in response.xpath('//html/body/div'):
item = DmozItem()
item['area_name'] = sel.xpath('//div/ul/li/a/span/text()').extract()
item['room_type'] = sel.xpath('//div/div/div/h1/span/text()').extract()
item['period'] = sel.xpath('/html/body/div/div/section/div/form/h4/span/text()').extract()
item['duration_weekly'] = sel.xpath('//html/body/div/div/section/div/form/div/div/em/text()').extract()
item['guide_total'] = sel.xpath('//html/body/div/div/section/div/form/div/div/p/text()').extract()
item['amenities'] = sel.xpath('//div/div/div/ul/li/p/text()').extract()
yield item
I have set up my items.py file as:
class DmozItem(Item):
area_name = Field()
room_type = Field()
period = Field()
duration_weekly = Field()
guide_total = Field()
amenities = Field()
pass
Not really sure where I am going wrong on this one?
You should import DmozItem
from YourFolderName.items import DmozItem
Related
I have a simple spider that crawls local obituaries. The code works perfectly until I try to add two static columns. All I want to do is add the date I pulled the information (pull item) and the state in which it was pulled (state item). It's a self loading page so when I add the pull date, I only get the first 10 results (or only the first page). If I add just the state, I only get two results. When I remove both, I get all 40+ results.
I did # lines that aren't working properly:
Item.py file:
import scrapy
class AlItem(scrapy.Item):
name = scrapy.Field()
link = scrapy.Field()
obit = scrapy.Field()
news = scrapy.Field()
#pull = scrapy.Field()
#state = scrapy.Field()
spider file:
import scrapy
import time
from al.items import AlItem
class AlabamaSpider(scrapy.Spider):
name = 'alabama'
allowed_domains = ['legacy.com']
start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']
def parse(self, response):
name = response.xpath('//a[#class="NonMobile"]/p[#class="obitName"]/text()').extract()
link = response.xpath('//div[#class="RightColumn"]//a[#class="ObituaryButton"]/#href').extract()
obit = response.xpath('//div[#class="NameAndLocation"]/p[#class="obitText"]/text()').extract()
news = response.xpath('//div[#class="PublishedLine publishedLine"]/span/text()').extract()
#pull = time.strftime("%m/%d/%Y")
#state = "AL"
for item in zip(name, link, obit, news): #removed 'pull, state'
new_item = AlItem()
new_item['name'] = item[0]
new_item['link'] = item[1]
new_item['obit'] = item[2]
new_item['news'] = item[3]
#new_item['pull'] = pull
#new_item["state"] = state
yield new_item
I explain why:
if you paste in here for item in zip(name, link, obit, news): pull & state, then you will get the number of iterations equal 2 because state = "AL" - string variable. ZIP function get from state two chars and set iteration = 2 for all arguments in loop. zip gets the smallest numb from arguments for iteration. as with the date 01/01/2001 - 10 chars. (will iterations equal 10)
WILL WORKING:
`class AlItem(scrapy.Item):
name = scrapy.Field()
link = scrapy.Field()
obit = scrapy.Field()
news = scrapy.Field()
pull = scrapy.Field()
state = scrapy.Field()`
class AlabamaSpider(scrapy.Spider):
name = 'alabama'
allowed_domains = ['legacy.com']
start_urls = ['http://www.legacy.com/obituaries/annistonstar/browsetype=paid&page=20']
def parse(self, response):
name = response.xpath('//a[#class="NonMobile"]/p[#class="obitName"]/text()').extract()
link = response.xpath('//div[#class="RightColumn"]//a[#class="ObituaryButton"]/#href').extract()
obit = response.xpath('//div[#class="NameAndLocation"]/p[#class="obitText"]/text()').extract()
news = response.xpath('//div[#class="PublishedLine publishedLine"]/span/text()').extract()
pull = time.strftime("%m/%d/%Y")
state = "AL"
for item in zip(name, link, obit, news): #removed 'pull, state'
new_item = AlItem()
new_item['name'] = item[0]
new_item['link'] = item[1]
new_item['obit'] = item[2]
new_item['news'] = item[3]
new_item['pull'] = pull
new_item["state"] = state
yield new_item
I want to remove the [ ] brackets scrapy adds to all it's output, to do this you simply add [0] at the end of an xpath statement as follows:
'a[#class="question-hyperlink"]/text()').extract()[0]
this solves the [ ] problem in some cases but in other cases scrapy returns every second row of output as blank and as such the moment it gets to the second row when using [0] i'm given the error:
Index error: list index out of range
How can I prevent scrapy from creating blank rows ? It seems like this is a common problem, but everyone faces this problem when exporting to CSV while for me it's with the scrapy response before exporting as CSV.
Items.py:
import scrapy
from scrapy.item import Item, Field
class QuestionItem(Item):
title = Field()
url = Field()
class PopularityItem(Item):
votes = Field()
answers = Field()
views = Field()
class ModifiedItem(Item):
lastModified = Field()
modName = Field()
The spider that doesn't output every second row as blank and thus works with [0]:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import QuestionItem
class QuestionSpider(Spider):
name = "questions"
allowed_domains = ["stackoverflow.com"]
start_urls = [
"http://stackoverflow.com/questions?pagesize=50&sort=newest",
]
def parse(self, response):
questions = Selector(response).xpath('//div[#class="summary"]/h3')
for question in questions:
item = QuestionItem()
item['title'] = question.xpath(
'a[#class="question-hyperlink"]/text()').extract()[0]
item['url'] = question.xpath(
'a[#class="question-hyperlink"]/#href').extract()[0]
yield item
The spider that gives every second row of output as blank:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import PopularityItem
class PopularitySpider(Spider):
name = "popularity"
allowed_domains = ["stackoverflow.com"]
start_urls = [
"https://stackoverflow.com/",
]
def parse(self, response):
popularity = response.xpath('//div[contains(#class, "question-summary narrow")]/div')
for poppart in popularity:
item = PopularityItem()
item['votes'] = poppart.xpath(
'div[contains(#class, "votes")]//span/text()').extract()#[0]
item['answers'] = poppart.xpath(
'div[contains(#class, "answered")]//span/text()').extract()#[0]
item['views'] = poppart.xpath(
'div[contains(#class, "views")]//span/text()').extract()#[0]
yield item
Pipelines.py
import pymongo
import logging
class StackPipeline(object):
def process_item(self, item, spider):
return item
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
class MongoDBPipeline(object):
def __init__(self):
connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
self.db = connection[settings['MONGODB_DB']]
def process_item(self, item, spider):
collection = self.db[type(item).__name__.lower()]
logging.info(collection.insert(dict(item)))
return item
The easiest way to handle an error like this is to catch it and deal with it then (in this case, by just moving on past the blank lines).
class PopularitySpider(Spider):
name = "popularity"
allowed_domains = ["stackoverflow.com"]
start_urls = ["https://stackoverflow.com/"]
def parse(self, response):
popularity = response.xpath('//div[contains(#class, "question-summary narrow")]/div')
for poppart in popularity:
try:
item = PopularityItem()
item['votes'] = poppart.xpath('div[contains(#class, "votes")]//span/text()').extract()[0]
item['answers'] = poppart.xpath('div[contains(#class, "answered")]//span/text()').extract()[0]
item['views'] = poppart.xpath('div[contains(#class, "views")]//span/text()').extract()[0]
except IndexError:
continue
yield item
I am trying to build a spider for a school project where I am scraping recipes from allrecipes.com. Everything is working really well, however I seem to be unable to remove duplicate recipes where one url contains the actual recipe, and the other contains the same url with "video=true" appended.
Here is my attempt to dealing with this in pipelines.py:
from scrapy.exceptions import DropItem
from scrapy import log
class DuplicatesPipeline(object):
# minCal = 50
def __init__(self):
self.urls_seen = set()
def process_vids(self, item, spider):
video = "video=true"
url = str(item.get('url'))
if video in url:
raise DropItem("Contains video")
else:
return item
def process_item(self, item, spider):
unique_id = item.get('url')
if unique_id in self.urls_seen:
raise DropItem("Duplicate Item found (%s)" % unique_id)
else:
self.urls_seen.add('url')
return item
settings.py:
# Scrapy settings for dirbot project
BOT_NAME = 'dirbot'
SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
ITEM_PIPELINES = {'dirbot.pipelines.DuplicatesPipeline': 300,}
items.py:
from scrapy.item import Item, Field
class Website(Item):
name = Field()
url = Field()
description = Field()
kcal = Field()
carbs = Field()
fat = Field()
protein = Field()
main = Field()
sugar = Field()
fibre = Field()
author = Field()
rating = Field()
img = Field()
dnot.py:
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider,Rule
import urlparse
import scrapy
page = "http://allrecipes.com/recipes/main.aspx?Page=%d#recipes"
class DmozSpider(Spider):
name = "dnot"
allowed_domains = ["allrecipes.com"]
start_urls = [page % 1]
rules = [Rule(SgmlLinkExtractor(allow=('allrecipes.com'), restrict_xpaths = '//a[contains(.,"NEXT")]'),
callback="parse", follow= True),
]
def __init__(self):
self.page_number = 1
def parse(self, response):
print "-------------------------------------------------"
print self.page_number
print "-------------------------------------------------"
sel = Selector(response)
sites = response.xpath('//div[#id="divGridItemWrapper"]')
items = []
for site in sites:
item = Website()
recipe = response.xpath('//a[contains(#href, "/Recipe/")]/#href').extract()
url = "http://www.allrecipes.com"
for nth in recipe:
go = urlparse.urljoin(url, str(nth))
items.append(item)
for link in go:
yield Request(go, self.recipes)
if self.page_number <= 3:
self.page_number += 1
yield Request(page % self.page_number)
else:
pass
def recipes(self,response):
item = Website()
sel = Selector(response)
recipe = response.xpath('//div[#id="content-wrapper"]')
items = []
print "second page - %s" % response.url
for i in recipe:
item['url'] = response.url
item['description'] = i.xpath('//span[#itemprop="description"]/text()').extract()
item['name'] = i.xpath('//h1[#itemprop="name"]/text()').extract()
item['kcal'] = i.xpath('//ul/li[contains(.,"kcal")]/span/text()').extract()
item['carbs'] = i.xpath('//ul/li[contains(.,"Carbohydrates")]/following-sibling::li[1]//span[#id="lblNutrientValue"]/text()').extract()
item['fat'] = i.xpath('//ul/li[contains(.,"Fat")]/following-sibling::li[1]//span[#id="lblNutrientValue"]/text()').extract()
item['protein'] = i.xpath('//ul/li[contains(.,"Protein")]/following-sibling::li[1]//span[#id="lblNutrientValue"]/text()').extract()
item['main'] = "allrecipes.com"
item['sugar'] = i.xpath('//li/span[#itemprop="sugarContent"]/text()').extract()
item['fibre'] = i.xpath('//li/span[#itemprop="proteinContent"]/text()').extract()
item['author'] = i.xpath('//span[#id="lblUser0"]/text()').extract()
item['rating'] = i.xpath('//div[#class="rating-stars-img"][1]/meta[1][#itemprop="ratingValue"]/#content').extract()
item['img'] = i.xpath('//img[#id="imgPhoto"]/#src').extract()
items.append(item)
yield item
I am a little new with Python, and I'm not sure if I need to convert the item['url'] into a string or not; however I have tried with the "str" and without. I have also tried a few other methods that others have used for doing something similar, but nothing has worked for me so far.
Hoping someone can point me in the right direction. Thanks in advance!
Example:
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1&video=true
You need to create a class that implements the process_item method on the pipelines.py file, something like:
from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs
class DuplicatesPipeline(object):
def __init__(self):
self.ids_seen = set()
def process_item(self, item, spider):
url = item['url']
u = urlparse(url)
query = parse_qs(u.query)
query.pop('video', None)
u = u._replace(query=urlencode(query, True))
unique_id = urlunparse(u)
if unique_id and unique_id in self.ids_seen:
raise DropItem("Duplicate Item found (%s)" % unique_id)
else:
self.ids_seen.add(unique_id)
return item
Then you need to add that class, to settings.py
ITEM_PIPELINES = {
'yourproject.pipelines.DuplicatesPipeline': 300,
}
Also, your process_vids method isn't being used.
let me know if it helps you.
I am new to python programming and using scrapy. I have setup my crawler and so far it was working until I got to the point where I wanted to figure out how to download images. The error I am getting is cannot import name NsiscrapePipeline. I dont know what I am doing wrong and I dont understand some of the documentation as I am new. Please help
Items File
from scrapy.item import Item, Field
class NsiscrapeItem(Item):
# define the fields for your item here like:
# name = Field()
location = Field()
stock_number = Field()
year = Field()
manufacturer = Field()
model = Field()
length = Field()
price = Field()
status = Field()
url = Field()
pass
Spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from NSIscrape.items import NsiscrapeItem
from scrapy.http import Request
from scrapy.contrib.pipeline.images import NsiscrapePipeline
import Image
class NsiscrapeSpider(BaseSpider):
name = "Nsiscrape"
allowed_domain = ["yachtauctions.com"]
start_urls = [
"http://www.yachtauctions.com/inventory/"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//tr')
items = []
for site in sites:
item = NsiscrapeItem()
item['location'] = site.select('td[2]/text()').extract()
item['stock_number'] = site.select('td[3]/a/text()').extract()
item['year'] = site.select('td[4]/text()').extract()
item['manufacturer'] = site.select('td[5]/text()').extract()
item['model'] = site.select('td[6]/text()').extract()
item['length'] = site.select('td[7]/text()').extract()
item['price'] = site.select('td[8]/text()').extract()
item['status'] = site.select('td[10]/img/#src').extract()
item['url'] = site.select('td[1]/a/#href').extract()
item['image_urls'] = site.select('td/a[3]/img/#data-original').extract()
item['images'] = item['image_urls']
yield Request(item['url'][0], meta={'item':item}, callback=self.product_detail_page)
def product_detail_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.request.meta['item']
#add all images url in the item['image_urls']
yield item
settings
ITEM_PIPELINES = ['scrapy.contrib.pipeline.image.NsiscrapePipeline']
IMAGES_STORE = 'c:\Python27\NSIscrape\IMG'
IMAGES_EXPIRES = 90
Pipelines This is where I am unsure if I am missing something
from scrapy.item import Item
class NsiscrapePipeline(Item):
image_urls = Field()
images = Field()
def process_item(self, item, spider):
return item
error
File "NSIscrape\spiders\NSI_Spider.py", line 9, in <module>
from scrapy.contrib.pipeline.images import NsiscrapePipeline
ImportError: cannot import name NsiscrapePipeline
You tried to pass list, but this function accepts only string. Pass only one element from list (for example list[0]).
Heres my final code thats working. There was two issues
1: I was missing the second backslash that needede to be in the request --> //td[1]/a[3]/img/#data-original
2: I had to check the full URL in which the image would be displayed and join them together which was the main URL or the allowed URL and the image URL.
def parse(self, response):
hxs = HtmlXPathSelector(response)
images = hxs.select('//tr')
url = []
for image in images:
urls = NsiscrapeItem()
urls['image_urls'] = ["http://www.yachtauctions.com" + x for x in image.select('//td[1]/a[3]/img/#data-original').extract()]
url.append(urls)
return url
That isn't part of the library :) - at least by looking at their current master branch
I think you're looking for ImagesPipeline
Their example may help! example
p.s. I don't think you custom name the class - at least not by how scapy is designed; i'm reasonably sure you use their class ;)
I am new to python programming and using scrapy. I have setup my crawler and so far it was working until I got to the point where I wanted to figure out how to download images. The error I am getting is cannot import name NsiscrapePipeline. I dont know what I am doing wrong and I dont understand some of the documentation as I am new. Please help
Items File
from scrapy.item import Item, Field
class NsiscrapeItem(Item):
# define the fields for your item here like:
# name = Field()
location = Field()
stock_number = Field()
year = Field()
manufacturer = Field()
model = Field()
length = Field()
price = Field()
status = Field()
url = Field()
pass
Spider
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from NSIscrape.items import NsiscrapeItem
from scrapy.http import Request
from scrapy.contrib.pipeline.images import NsiscrapePipeline
import Image
class NsiscrapeSpider(BaseSpider):
name = "Nsiscrape"
allowed_domain = ["yachtauctions.com"]
start_urls = [
"http://www.yachtauctions.com/inventory/"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//tr')
items = []
for site in sites:
item = NsiscrapeItem()
item['location'] = site.select('td[2]/text()').extract()
item['stock_number'] = site.select('td[3]/a/text()').extract()
item['year'] = site.select('td[4]/text()').extract()
item['manufacturer'] = site.select('td[5]/text()').extract()
item['model'] = site.select('td[6]/text()').extract()
item['length'] = site.select('td[7]/text()').extract()
item['price'] = site.select('td[8]/text()').extract()
item['status'] = site.select('td[10]/img/#src').extract()
item['url'] = site.select('td[1]/a/#href').extract()
item['image_urls'] = site.select('td/a[3]/img/#data-original').extract()
item['images'] = item['image_urls']
yield Request(item['url'][0], meta={'item':item}, callback=self.product_detail_page)
def product_detail_page(self, response):
hxs = HtmlXPathSelector(response)
item = response.request.meta['item']
#add all images url in the item['image_urls']
yield item
settings
ITEM_PIPELINES = ['scrapy.contrib.pipeline.image.NsiscrapePipeline']
IMAGES_STORE = 'c:\Python27\NSIscrape\IMG'
IMAGES_EXPIRES = 90
Pipelines This is where I am unsure if I am missing something
from scrapy.item import Item
class NsiscrapePipeline(Item):
image_urls = Field()
images = Field()
def process_item(self, item, spider):
return item
error
File "NSIscrape\spiders\NSI_Spider.py", line 9, in <module>
from scrapy.contrib.pipeline.images import NsiscrapePipeline
ImportError: cannot import name NsiscrapePipeline
You tried to pass list, but this function accepts only string. Pass only one element from list (for example list[0]).
Heres my final code thats working. There was two issues
1: I was missing the second backslash that needede to be in the request --> //td[1]/a[3]/img/#data-original
2: I had to check the full URL in which the image would be displayed and join them together which was the main URL or the allowed URL and the image URL.
def parse(self, response):
hxs = HtmlXPathSelector(response)
images = hxs.select('//tr')
url = []
for image in images:
urls = NsiscrapeItem()
urls['image_urls'] = ["http://www.yachtauctions.com" + x for x in image.select('//td[1]/a[3]/img/#data-original').extract()]
url.append(urls)
return url
That isn't part of the library :) - at least by looking at their current master branch
I think you're looking for ImagesPipeline
Their example may help! example
p.s. I don't think you custom name the class - at least not by how scapy is designed; i'm reasonably sure you use their class ;)