I am a newbie.
This is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from ampa.items import AmpaItem

class AmpaSpider(CrawlSpider):
    name = "ampa"
    allowed_domains = ['website']
    start_urls = ['website/page']
    rules = (Rule(SgmlLinkExtractor(allow=('associados?', ), deny=('associado/', )), callback='parse_page', follow=True),)

    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = AmpaItem()
        farmers = hxs.select('//div[@class="span-24 tx_left"]')
        item['nome'] = farmers.select('//div/h3[@class="titulo"]/a/text()').extract()
        item['phone'] = farmers.select('//div/span[@class="chamada"]/a[contains(text(), "Telefone")]/text()').extract()
        item['email'] = farmers.select('//div/span[@class="chamada"]/a[contains(text(), "E-mail")]/text()').extract()
        print item.values()
        return item
This is my pipeline:
import csv

class CsvWriterPipeline(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('items.csv', 'wb'))

    def process_item(self, item, ampa):
        self.csvwriter.writerow([item['nome'], item['phone'], item['email']])
        return item
Each page of the website has a list of names, phones and e-mails. The code above outputs a CSV file with three columns and one row per page: in the first column each cell is a list of all the names on that page, in the second a list of all the phones, and in the third a list of all the e-mails.
What I really want is each name, phone and e-mail on its own row. I tried to do it by looping through each item, but it only prints the first name, phone and e-mail of each page. (Is it because the callback moves the crawler to the next URL each time the spider function returns an item? Does it?)
How would you go about that?
Here is the item:
from scrapy.item import Item, Field

class AmpaItem(Item):
    nome = Field()
    phone = Field()
    email = Field()
Based on your use of the plural in farmers, I assume there are many farmers on the page, so your expression will likely return a collection of them.
Can you loop through the result of farmers and yield each item?
# pseudocode
hxs = HtmlXPathSelector(response)
farmers = hxs.select('//div[@class="span-24 tx_left"]')
for farmer in farmers:
    item = AmpaItem()
    # be sure to select only one desired farmer here
    item['nome'] = farmer.select('//div/h3[@class="titulo"]/a/text()').extract()
    item['phone'] = farmer.select('//div/span[@class="chamada"]/a[contains(text(), "Telefone")]/text()').extract()
    item['email'] = farmer.select('//div/span[@class="chamada"]/a[contains(text(), "E-mail")]/text()').extract()
    yield item
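For what it's worth, the key is making the inner expressions relative (note the leading dot) so that each iteration only looks inside the current farmer. A minimal sketch, assuming each farmer sits in its own div matched by the expression above (I haven't seen the actual page):

def parse_page(self, response):
    hxs = HtmlXPathSelector(response)
    for farmer in hxs.select('//div[@class="span-24 tx_left"]'):
        item = AmpaItem()
        # ".//" restricts every expression to the current farmer's subtree
        item['nome'] = farmer.select('.//h3[@class="titulo"]/a/text()').extract()
        item['phone'] = farmer.select('.//span[@class="chamada"]/a[contains(text(), "Telefone")]/text()').extract()
        item['email'] = farmer.select('.//span[@class="chamada"]/a[contains(text(), "E-mail")]/text()').extract()
        yield item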
I found the solution by changing my pipeline:
import csv
import itertools

class CsvWriterPipeline(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('items.csv', 'wb'), delimiter=',')

    def process_item(self, item, ampa):
        for i, n, k in itertools.izip(item['nome'], item['phone'], item['email']):
            self.csvwriter.writerow([i, n, k])
        return item
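One side note: itertools.izip only exists on Python 2. If this ever runs on Python 3, the built-in zip does the same job (and the file would need to be opened in text mode rather than 'wb'); the loop itself stays the same:

for nome, phone, email in zip(item['nome'], item['phone'], item['email']):
    self.csvwriter.writerow([nome, phone, email])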
Thanks DrColossos and dm03514!!
This was my first question on Stack Overflow!!
I want to remove the [ ] brackets Scrapy adds to all of its output. To do this you simply add [0] at the end of an XPath statement, as follows:
'a[@class="question-hyperlink"]/text()').extract()[0]
This solves the [ ] problem in some cases, but in other cases Scrapy returns every second row of output as blank, so the moment it gets to the second row while using [0] I get the error:
IndexError: list index out of range
How can I prevent Scrapy from creating blank rows? This seems to be a common problem, but everyone else hits it when exporting to CSV, while for me it happens in the Scrapy response before exporting to CSV.
Items.py:
import scrapy
from scrapy.item import Item, Field

class QuestionItem(Item):
    title = Field()
    url = Field()

class PopularityItem(Item):
    votes = Field()
    answers = Field()
    views = Field()

class ModifiedItem(Item):
    lastModified = Field()
    modName = Field()
The spider that doesn't output every second row as blank and thus works with [0]:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import QuestionItem

class QuestionSpider(Spider):
    name = "questions"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?pagesize=50&sort=newest",
    ]

    def parse(self, response):
        questions = Selector(response).xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = QuestionItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
The spider that gives every second row of output as blank:
from scrapy import Spider
from scrapy.selector import Selector
from stack.items import PopularityItem

class PopularitySpider(Spider):
    name = "popularity"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "https://stackoverflow.com/",
    ]

    def parse(self, response):
        popularity = response.xpath('//div[contains(@class, "question-summary narrow")]/div')
        for poppart in popularity:
            item = PopularityItem()
            item['votes'] = poppart.xpath(
                'div[contains(@class, "votes")]//span/text()').extract()#[0]
            item['answers'] = poppart.xpath(
                'div[contains(@class, "answered")]//span/text()').extract()#[0]
            item['views'] = poppart.xpath(
                'div[contains(@class, "views")]//span/text()').extract()#[0]
            yield item
Pipelines.py
import pymongo
import logging

class StackPipeline(object):
    def process_item(self, item, spider):
        return item

from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log

class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        self.db = connection[settings['MONGODB_DB']]

    def process_item(self, item, spider):
        collection = self.db[type(item).__name__.lower()]
        logging.info(collection.insert(dict(item)))
        return item
The easiest way to handle an error like this is to catch it and deal with it then (in this case, by just moving on past the blank lines).
class PopularitySpider(Spider):
    name = "popularity"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ["https://stackoverflow.com/"]

    def parse(self, response):
        popularity = response.xpath('//div[contains(@class, "question-summary narrow")]/div')
        for poppart in popularity:
            try:
                item = PopularityItem()
                item['votes'] = poppart.xpath('div[contains(@class, "votes")]//span/text()').extract()[0]
                item['answers'] = poppart.xpath('div[contains(@class, "answered")]//span/text()').extract()[0]
                item['views'] = poppart.xpath('div[contains(@class, "views")]//span/text()').extract()[0]
            except IndexError:
                continue
            yield item
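Another way to avoid the IndexError, if the installed Scrapy version supports it (1.0+), is extract_first() with a default value, which needs no try/except. A sketch of one field, not the answerer's code (the default of '0' is just an illustration):

item['votes'] = poppart.xpath('div[contains(@class, "votes")]//span/text()').extract_first(default='0')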
I'm trying to scrape this site using Scrapy, but it returns all the values in a single cell; I expect each value in a different row.
example:
milage: 25
milage: 377
milage: 247433
milage: 464130
But I'm getting the data like this
example:
milage:[u'25',
u'377',
u'247433',
u'399109',
u'464130',
u'399631',
u'435238',
u'285000',
u'287470',
u'280000']
Here is my code:
import scrapy
from ..items import ExampleItem
from scrapy.selector import HtmlXPathSelector

url = 'https://example.com'

class Example(scrapy.Spider):
    name = 'example'
    allowed_domains = ['www.example.com']
    start_urls = [url]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item_selector = hxs.select('//div[@class="listing_format card5 relative"]')
        for fields in item_selector:
            item = ExampleItem()
            item['Mileage'] = fields.select('//li[strong="Mileage"]/span/text()').extract()
            yield item
You didn't show your site, but maybe you need a relative XPath:
item['Mileage'] = fields.select('.//li[strong="Mileage"]/span/text()').extract_first()
It sounds like you need to iterate over your mileages.
for fields in item_selector:
    mileages = fields.select('//li[strong="Mileage"]/span/text()').extract()
    for mileage in mileages:
        item = ExampleItem()
        item['Mileage'] = mileage
        yield item
Also consider making your fields.select('//li[strong="Mileage"]/span/text()').extract() more specific?
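Putting the two answers together (a relative expression plus one item per mileage value) might look like the sketch below; the selectors are taken from the question and assumed correct for the site, which I haven't checked:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    for fields in hxs.select('//div[@class="listing_format card5 relative"]'):
        # ".//" keeps the expression scoped to the current listing
        for mileage in fields.select('.//li[strong="Mileage"]/span/text()').extract():
            item = ExampleItem()
            item['Mileage'] = mileage
            yield item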
I've built my first Scrapy project but can't get over the last hurdle.
With my script below I get one long list in the CSV: first all the product prices and then all the product names.
What I would like to achieve is that for every product the price is next to it.
For example:
Product Name, Product Price
Product Name, Product Price
My scrapy project:
Items.py
from scrapy.item import Item, Field

class PrijsvergelijkingItem(Item):
    Product_ref = Field()
    Product_price = Field()
My Spider called nvdb.py:
from scrapy.spider import BaseSpider
import scrapy.selector
from Prijsvergelijking.items import PrijsvergelijkingItem

class MySpider(BaseSpider):
    name = "nvdb"
    allowed_domains = ["vandenborre.be"]
    start_urls = ["http://www.vandenborre.be/tv-lcd-led/lcd-led-tv-80-cm-alle-producten"]

    def parse(self, response):
        hxs = scrapy.Selector(response)
        titles = hxs.xpath("//ul[@id='prodlist_ul']")
        items = []
        for titles in titles:
            item = PrijsvergelijkingItem()
            item["Product_ref"] = titles.xpath("//div[@class='prod_naam']//text()[2]").extract()
            item["Product_price"] = titles.xpath("//div[@class='prijs']//text()[2]").extract()
            items.append(item)
        return items
You need to switch your XPath expressions to work in the context of every "product". In order to do this, you need to prepend a dot to the expressions:
def parse(self, response):
    products = response.xpath("//ul[@id='prodlist_ul']/li")
    for product in products:
        item = PrijsvergelijkingItem()
        item["Product_ref"] = product.xpath(".//div[@class='prod_naam']//text()[2]").extract_first()
        item["Product_price"] = product.xpath(".//div[@class='prijs']//text()[2]").extract_first()
        yield item
I've also improved the code a little bit:
I assume you meant to iterate over list items ul->li and not just ul - fixed the expression
used the response.xpath() shortcut method
used extract_first() instead of extract()
improved the variable naming
used yield instead of collecting items in a list and then returning
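As a quick illustration of why the dot matters, here is what the two forms do when tried in scrapy shell on the listing page (selectors taken from the question; not real captured output):

products = response.xpath("//ul[@id='prodlist_ul']/li")
# absolute: searches the whole page, so it always finds the first product's name
products[0].xpath("//div[@class='prod_naam']//text()[2]").extract_first()
# relative: searches only inside products[0]
products[0].xpath(".//div[@class='prod_naam']//text()[2]").extract_first()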
I am not sure if this can help you, but you can use OrderedDict from collections for your need.
from scrapy.spider import BaseSpider
import scrapy.selector
from collections import OrderedDict
from Prijsvergelijking.items import PrijsvergelijkingItem

class MySpider(BaseSpider):
    name = "nvdb"
    allowed_domains = ["vandenborre.be"]
    start_urls = ["http://www.vandenborre.be/tv-lcd-led/lcd-led-tv-80-cm-alle-producten"]

    def parse(self, response):
        hxs = scrapy.Selector(response)
        titles = hxs.xpath("//ul[@id='prodlist_ul']")
        items = []
        for titles in titles:
            item = OrderedDict(PrijsvergelijkingItem())
            item["Product_ref"] = titles.xpath("//div[@class='prod_naam']//text()[2]").extract()
            item["Product_price"] = titles.xpath("//div[@class='prijs']//text()[2]").extract()
            items.append(item)
        return items
Also, you might have to change the way you iterate over the dict:
for od in items:
    for key, value in od.items():
        print key, value
My goal is to extract all 25 rows (6 items per row) per page and then iterate over each of the 40 pages.
Currently, my spider only extracts the first row from pages 1-3 (see CSV output image).
I assumed the list_iterator() function would iterate over each row; however, there appears to be an error in either my rules or the list_iterator() function that is preventing all rows per page from being scraped.
Any assistance or advice is greatly appreciated!
propub_spider.py:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from propub.items import PropubItem
from scrapy.http import Request

class propubSpider(CrawlSpider):
    name = 'prop$'
    allowed_domains = ['https://projects.propublica.org']
    max_pages = 40
    start_urls = [
        'https://projects.propublica.org/docdollars/search?state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=2&state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=3&state%5Bid%5D=33']

    rules = (Rule(SgmlLinkExtractor(allow=('\\search?page=\\d')), 'parse_start_url', follow=True),)

    def list_iterator(self):
        for i in range(self.max_pages):
            yield Request('https://projects.propublica.org/docdollars/search?page=d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]/tbody'):
            item = PropubItem()
            item['payee'] = sel.xpath('tr[1]/td[1]/a[2]/text()').extract()
            item['link'] = sel.xpath('tr[1]/td[1]/a[1]/@href').extract()
            item['city'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['state'] = sel.xpath('tr[1]/td[3]/text()').extract()
            item['company'] = sel.xpath('tr[1]/td[4]').extract()
            item['amount'] = sel.xpath('tr[1]/td[7]/span/text()').extract()
            yield item
pipelines.py:
import csv

class PropubPipeline(object):
    def __init__(self):
        self.myCSV = csv.writer(open('C:\Users\Desktop\propub.csv', 'wb'))
        self.myCSV.writerow(['payee', 'link', 'city', 'state', 'company', 'amount'])

    def process_item(self, item, spider):
        self.myCSV.writerow([item['payee'][0].encode('utf-8'),
                             item['link'][0].encode('utf-8'),
                             item['city'][0].encode('utf-8'),
                             item['state'][0].encode('utf-8'),
                             item['company'][0].encode('utf-8'),
                             item['amount'][0].encode('utf-8')])
        return item
items.py:
import scrapy
from scrapy.item import Item, Field

class PropubItem(scrapy.Item):
    payee = scrapy.Field()
    link = scrapy.Field()
    city = scrapy.Field()
    state = scrapy.Field()
    company = scrapy.Field()
    amount = scrapy.Field()
CSV output:
Multiple things need to be fixed:
use start_requests() method instead of list_iterator()
there is a missing % here:
yield Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)
# HERE^
you don't need CrawlSpider since you are providing the pagination links via start_requests() - use regular scrapy.Spider
it would be more reliable if the XPath expressions matched the cells by class attributes
Fixed version:
import scrapy
from propub.items import PropubItem

class propubSpider(scrapy.Spider):
    name = 'prop$'
    allowed_domains = ['projects.propublica.org']
    max_pages = 40

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]//tr[@data-payment-id]'):
            item = PropubItem()
            item['payee'] = sel.xpath('td[@class="name_and_payee"]/a[last()]/text()').extract()
            item['link'] = sel.xpath('td[@class="name_and_payee"]/a[1]/@href').extract()
            item['city'] = sel.xpath('td[@class="city"]/text()').extract()
            item['state'] = sel.xpath('td[@class="state"]/text()').extract()
            item['company'] = sel.xpath('td[@class="company"]/text()').extract()
            item['amount'] = sel.xpath('td[@class="amount"]/text()').extract()
            yield item
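If the class-based expressions need checking against the live page, scrapy shell is convenient for that. A quick sketch using the same selectors as above (the URL is one of the question's start URLs; the expected count of 25 comes from the question):

# scrapy shell 'https://projects.propublica.org/docdollars/search?page=2&state%5Bid%5D=33'
rows = response.xpath('//*[@id="payments_list"]//tr[@data-payment-id]')
len(rows)                                              # should be 25 per page
rows[0].xpath('td[@class="amount"]/text()').extract()  # one payment amount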
I have a spider which reads a list of URLs from a text file and saves the title and body text from each. The crawl works, but the data does not get saved to CSV. I set up a pipeline to save to CSV because the normal -o option did not work for me. I did change settings.py for the pipeline. Any help with this would be greatly appreciated.
The code is as follows:
Items.py
from scrapy.item import Item, Field

class PrivacyItem(Item):
    # define the fields for your item here like:
    # name = Field()
    title = Field()
    desc = Field()
PrivacySpider.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from privacy.items import PrivacyItem

class PrivacySpider(CrawlSpider):
    name = "privacy"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for url in start_urls:
            item = PrivacyItem()
            item['desc'] = hxs.select('//body//p/text()').extract()
            item['title'] = hxs.select('//title/text()').extract()
            items.append(item)
        return items
Pipelines.py
import csv

class CSVWriterPipeline(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('CONTENT.csv', 'wb'))

    def process_item(self, item, spider):
        self.csvwriter.writerow([item['title'][0], item['desc'][0]])
        return item
You don't have to loop over start_urls; Scrapy does something like this for you:
for url in spider.start_urls:
    request url and call spider.parse() with its response
So your parse function should look something like:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    item = PrivacyItem()
    item['desc'] = hxs.select('//body//p/text()').extract()
    item['title'] = hxs.select('//title/text()').extract()
    return item
Also try to avoid returning lists as item fields; do something like hxs.select('..').extract()[0].
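Applied to the fields above, that last suggestion might look like this sketch (joining the paragraphs into a single string is my assumption about how the body text should be stored, not something stated in the answer):

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    item = PrivacyItem()
    # join all paragraph texts into one string instead of storing a list
    item['desc'] = ' '.join(hxs.select('//body//p/text()').extract())
    # take the first title text; a page normally has exactly one <title>
    item['title'] = hxs.select('//title/text()').extract()[0]
    return item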