writing itemloader by item to xml or csv using scrapy - python

I am a Scrapy newbie and have written the spider below. I want to write to XML or CSV with each row in the CSV (or each item in the XML) as name,tele,addr.
I am using command:
scrapy crawl abc -o items.csv -t csv
I am looking for output:
name,addr,tele
n1,a1,t1
n2,a2,t2
n3,a3,t3
But I get:
name,addr,tele
n1,n2,n3 a1,a2,a3 t1,t2,t3
Spider Code
import scrapy
from abc.items import abcItem
from scrapy.contrib.loader import ItemLoader

class abcSpider(scrapy.Spider):
    name = "abc"
    allowed_domains = ["abc.com"]
    start_urls = ["http://abc.com/"]

    def parse(self, response):
        l = ItemLoader(item=abcItem(), response=response)
        l.add_xpath('name', '//section[@class="abcrp"]/a/@title')
        l.add_xpath('tele', '//p[@class="abcw"]/a/@href')
        l.add_xpath('addr', '//span[@class="dn"]/text()')
        return l.load_item()
items code
import scrapy

class abcItem(scrapy.Item):
    name = scrapy.Field()
    addr = scrapy.Field()
    tele = scrapy.Field()

I was able to resolve this. I used a for loop on an outer tag that contained my name, addr, and tele tags.
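For reference, a minimal sketch of that fix, assuming each listing on the page is wrapped in an outer tag such as section[@class="abcrp"] (the exact outer selector is an assumption, not taken from the real site). Creating one ItemLoader per outer node is what makes each load_item() produce a separate row; a single page-wide loader collects every match into one item, which is exactly the joined output shown above.

import scrapy
from abc.items import abcItem
from scrapy.contrib.loader import ItemLoader

class abcSpider(scrapy.Spider):
    name = "abc"
    allowed_domains = ["abc.com"]
    start_urls = ["http://abc.com/"]

    def parse(self, response):
        # One loader per outer listing tag -> one item (one CSV row) per listing.
        # The outer selector below is illustrative; substitute the tag that
        # actually wraps each name/addr/tele group on the page.
        for listing in response.xpath('//section[@class="abcrp"]'):
            l = ItemLoader(item=abcItem(), selector=listing)
            l.add_xpath('name', 'a/@title')
            l.add_xpath('tele', './/p[@class="abcw"]/a/@href')
            l.add_xpath('addr', './/span[@class="dn"]/text()')
            yield l.load_item()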

Related

How to scrape JSON web pages

Hey, so I have some experience scraping HTML but never JSON, and I need to scrape the following web page using Scrapy: http://www.starcitygames.com/buylist/search?search-type=category&id=5061. I found a tutorial online that uses Scrapy along with jmespath to scrape JSON data from the web. I got the tutorial to work, but I am trying to alter it to work with my website with no luck. No errors, but it does not return any data. Any help would be greatly appreciated!
items.py
import scrapy

class NameItem(scrapy.Item):
    """User item definition for jsonplaceholder /LoginSpider endpoint."""
    name = scrapy.Field()
    condition = scrapy.Field()
    price = scrapy.Field()
    rarity = scrapy.Field()
LoginSpider.py
import scrapy
import json
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import NameItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, SelectJmes

class UserSpider(scrapy.Spider):
    """Spider to scrape `http://www.starcitygames.com/buylist/search?search-type=category&id=5061`."""
    name = 'LoginSpider'
    allowed_domains = ['starcitygames.com']
    start_urls = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']

    # dictionary to map NameItem fields to JMESPath query paths
    jmes_paths = {
        'name': 'name',
        'condition': 'condition',
        'price': 'price',
        'rarity': 'rarity',
    }

    def parse(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        for user in jsonresponse:
            loader = ItemLoader(item=NameItem())  # create an ItemLoader to populate a NameItem
            loader.default_input_processor = MapCompose(str)  # apply str conversion on each value
            loader.default_output_processor = Join(' ')
            for (field, path) in self.jmes_paths.items():
                loader.add_value(field, SelectJmes(path)(user))
            yield loader.load_item()
The response of this URL http://www.starcitygames.com/buylist/search?search-type=category&id=5061 has 3 levels:
'Ok'
'search'
'results'  ## this contains the data
And the results key has multiple values that you should iterate over.
Inside the values are the data.
Try this code, I hope it helps.
This is the module items.py
import scrapy

class SoResponseItem(scrapy.Item):
    name = scrapy.Field()
    condition = scrapy.Field()
    price = scrapy.Field()
    rarity = scrapy.Field()
This is the spider
import scrapy
import json
from SO_response.items import SoResponseItem

class LoginspiderSpider(scrapy.Spider):
    name = 'LoginSpider'
    allowed_domains = ['www.starcitygames.com']
    url = 'http://www.starcitygames.com/'

    def start_requests(self):
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        url = response.urljoin('buylist/search?search-type=category&id=5061')
        yield scrapy.Request(url=url, callback=self.parse_data)

    def parse_data(self, response):
        jsonresponse = json.loads(response.body)
        for result in jsonresponse['results']:
            for index in range(len(result)):
                items = SoResponseItem()
                items['name'] = result[index]['name']
                items['condition'] = result[index]['condition']
                items['price'] = result[index]['price']
                items['rarity'] = result[index]['rarity']
                yield items
Try it in your shell:
scrapy crawl LoginSpider -o jmes.json
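If you'd rather keep the ItemLoader/SelectJmes approach from the question, the same structural fix should apply: iterate over the nested results values instead of the top-level response. A sketch of the reworked parse method, assuming the Ok/search/results shape described above:

def parse(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    # The data sits under the 'results' key, not at the top level,
    # which is why iterating over the raw response yielded nothing.
    for result_list in jsonresponse['results']:
        for user in result_list:
            loader = ItemLoader(item=NameItem())
            loader.default_input_processor = MapCompose(str)
            loader.default_output_processor = Join(' ')
            for (field, path) in self.jmes_paths.items():
                loader.add_value(field, SelectJmes(path)(user))
            yield loader.load_item()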

No output while scraping using scrapy

I was going to scrape the commentary from the espncricinfo website using Scrapy, and I got the output (items.csv) as blank. These are my files.
cricinfo.py (Spider File)
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from crictest.items import CrictestItem

class MySpider(BaseSpider):
    name = "cricinfo"
    allowed_domains = ["espncricinfo.com/"]
    start_urls = ["http://www.espncricinfo.com/champions-league-twenty20-2014/engine/match/763595.html?innings=1;view=commentary/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//td[@class="battingComms" and b]')
        for row in rows:
            item = CrictestItem()
            item['overnum'] = row.select('b/text()').extract()[0]
            item['overnumtext'] = row.select('b/following-sibling::text()').extract()[0]
            yield item
items.py
import scrapy
class CrictestItem(scrapy.Item):
overnum = scrapy.Field()
overnumtext = scrapy.Field()
The problem is your XPath. You can try using this in Chrome's console:
$x('//*[@id="commInnings"]/div[2]/div/div')
In your code, rewrite this line:
rows = hxs.select('//td[@class="battingComms" and b]')
With the battingComms XPath I can't get any output in the console.
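As a minimal sketch of that suggestion, the spider's parse method would swap in the selector verified in Chrome (whether the b/text() extraction still matches inside those divs is an assumption; adjust it to the actual markup):

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # The battingComms cells never match on this page; use the
    # selector that returns nodes in Chrome's console instead.
    rows = hxs.select('//*[@id="commInnings"]/div[2]/div/div')
    for row in rows:
        item = CrictestItem()
        item['overnum'] = row.select('b/text()').extract()[0]
        item['overnumtext'] = row.select('b/following-sibling::text()').extract()[0]
        yield item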

Pass input file to scrapy containing a list of domains to be scraped

I saw this link: Pass Scrapy Spider a list of URLs to crawl via .txt file.
This changes the list of start URLs. I want to scrape webpages for each domain (from a file) and put the results into a separate file (named after the domain).
I have scraped data for a website, but I have specified the start URL and allowed_domains in the spider itself. How can I change this using an input file?
Update 1:
This is the code that I tried:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field

class AppleItem(Item):
    reference_link = Field()
    rss_link = Field()

class AppleSpider(CrawlSpider):
    name = 'apple'
    allowed_domains = []
    start_urls = []

    def __init__(self):
        for line in open('./domains.txt', 'r').readlines():
            self.allowed_domains.append(line)
            self.start_urls.append('http://%s' % line)

    rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        rsslinks = sel.select('//a[contains(@href, "pdf")]/@href').extract()
        items = []
        for rss in rsslinks:
            item = AppleItem()
            item['reference_link'] = response.url
            item['rss_link'] = rsslinks
            items.append(item)
        filename = response.url.split("/")[-2]
        open(filename+'.csv', 'wb').write(items)
I get an error when I run this: AttributeError: 'AppleSpider' object has no attribute '_rules'
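For what it's worth, that AttributeError usually means the overridden __init__ never calls the parent constructor: CrawlSpider.__init__ is what compiles the class-level rules into the _rules attribute. A sketch of the fix for the code above (the strip() call is also needed, since readlines() keeps the trailing newline):

def __init__(self, *args, **kwargs):
    super(AppleSpider, self).__init__(*args, **kwargs)  # compiles rules into _rules
    for line in open('./domains.txt', 'r').readlines():
        line = line.strip()  # readlines() keeps '\n', which would corrupt the URLs
        self.allowed_domains.append(line)
        self.start_urls.append('http://%s' % line)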
You can use the __init__ method of the spider class to read the file and overwrite start_urls and allowed_domains.
Suppose we have a file domains.txt with the content:
example1.com
example2.com
...
Example:
class MySpider(BaseSpider):
    name = "myspider"
    allowed_domains = []
    start_urls = []

    def __init__(self):
        for line in open('./domains.txt', 'r').readlines():
            line = line.strip()  # drop the trailing newline, or the URLs will be malformed
            self.allowed_domains.append(line)
            self.start_urls.append('http://%s' % line)

    def parse(self, response):
        # here you will get data from parsing the page
        # then put your data into a single file
        # from the scrapy tutorial: http://doc.scrapy.org/en/latest/intro/tutorial.html
        filename = response.url.split("/")[-2]
        open(filename, 'wb').write(your_data)
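To get one file per domain, as the question asks, the parse callback can name the file after the host instead of a URL segment. A sketch, assuming Python 2 as in the examples above and with your_data standing in for whatever you extract from the page:

from urlparse import urlparse  # Python 2; urllib.parse on Python 3

def parse(self, response):
    # Name the output file after the domain the page belongs to
    domain = urlparse(response.url).netloc
    with open('%s.csv' % domain, 'ab') as f:  # append: one domain spans many pages
        f.write(your_data)  # your_data is the placeholder from the example above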

Scrape using multiple POST data from the same URL

I have already created one spider that collects a list of company names with matching phone numbers. This is then saved to a CSV file.
I now want to scrape data from another site using the phone numbers in the CSV file as POST data. I want it to loop through the same start URL, scraping the data that each phone number produces, until there are no more numbers left in the CSV file.
This is what I have got so far:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy import log
import sys
from scrapy.shell import inspect_response
from btw.items import BtwItem
import csv

class BtwSpider(BaseSpider):
    name = "btw"
    allowed_domains = ["siteToScrape.com"]
    start_urls = ["http://www.siteToScrape.com/broadband/broadband_checker"]

    def parse(self, response):
        phoneNumbers = ['01253873647', '01253776535', '01142726749']
        return [FormRequest.from_response(response,
                                          formdata={'broadband_checker[phone]': phoneNumbers[1]},
                                          callback=self.after_post)]

    def after_post(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="results"]')
        items = []
        for site in sites:
            item = BtwItem()
            fttcText = site.select("div[@class='content']/div[@id='btfttc']/ul/li/text()").extract()
            # Now we will change the text to be a boolean value
            if fttcText[0].count('not') > 0:
                fttcEnabled = 0
            else:
                fttcEnabled = 1
            item['fttcAvailable'] = fttcEnabled
            items.append(item)
        return items
At the minute I have just been trying to get this looping through a list (phoneNumbers), but I have not even managed to get that to work so far. Once I know how to do that, I will be able to get it to pull from a CSV file by myself. In its current state it is just using the phone number with an index of 1 in the list.
Assuming you have a phones.csv file with phones in it:
01253873647
01253776535
01142726749
Here's your spider:
import csv
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector

class BtwItem(Item):
    fttcAvailable = Field()
    phoneNumber = Field()

class BtwSpider(BaseSpider):
    name = "btw"
    allowed_domains = ["samknows.com"]

    def start_requests(self):
        yield Request("http://www.samknows.com/broadband/broadband_checker", self.parse_main_page)

    def parse_main_page(self, response):
        with open('phones.csv', 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                phone_number = row[0]
                yield FormRequest.from_response(response,
                                                formdata={'broadband_checker[phone]': phone_number},
                                                callback=self.after_post,
                                                meta={'phone_number': phone_number})

    def after_post(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="results"]')
        phone_number = response.meta['phone_number']
        for site in sites:
            item = BtwItem()
            fttc = site.select("div[@class='content']/div[@id='btfttc']/ul/li/text()").extract()
            item['phoneNumber'] = phone_number
            item['fttcAvailable'] = 'not' in fttc[0]
            yield item
Here's what was scraped after running it:
{'fttcAvailable': False, 'phoneNumber': '01253873647'}
{'fttcAvailable': False, 'phoneNumber': '01253776535'}
{'fttcAvailable': True, 'phoneNumber': '01142726749'}
The idea is to scrape the main page using start_requests, then read the CSV file line-by-line in the callback and yield new Requests for each phone number (CSV row). Additionally, pass phone_number to the callback through the meta dictionary in order to write it into the item field (I think you need this to distinguish items/results).
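If you then want the results in a CSV like in the earlier questions, a run command along these lines should do it (btw is the spider's name above):

scrapy crawl btw -o results.csv -t csv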
Hope that helps.

scrapy: newbie attempting to debug code

Total newbie, trying to get Scrapy to read a list of URLs from a CSV and return the items in a CSV.
Need some help to figure out where I'm going wrong here:
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import random

class incyspider(BaseSpider):
    name = "incyspider"

    def __init__(self):
        super(incyspider, self).__init__()
        domain_name = "incyspider.co.uk"
        f = open("urls.csv")
        start_urls = [url.strip() for url in f.readlines()]
        f.close

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="Product"]')
        items = []
        for site in sites:
            item['title'] = hxs.select('//div[@class="Name"]/node()').extract()
            item['hlink'] = hxs.select('//div[@class="Price"]/node()').extract()
            item['price'] = hxs.select('//div[@class="Codes"]/node()').extract()
            items.append(item)
        return items

SPIDER = incyspider()
Here's the items.py code:
from scrapy.item import Item, Field

class incyspider(Item):
    # define the fields for your item here like:
    # name = Field()
    title = Field()
    hlink = Field()
    price = Field()
To run, I'm using
scrapy crawl incyspider -o items.csv -t csv
I would seriously appreciate any pointers.
I'm not exactly sure, but after a quick look at your code I would say that at least you need to replace this line:
sites = hxs.select('//div[@class="Product"]')
with this line:
sites = hxs.select('//div[@class="Product"]').extract()
As a first punt at answering this, your spider code is missing an import for your incyspider item class. Also, you're not creating an instance of any kind of item to store the title/hlink/price info, so the items.append(item) line will complain.
Since your spider is also called incyspider, you should rename the item to something like incyspiderItem and then update your spider code as follows:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import random
from incyspider.items import incyspiderItem

class incyspider(BaseSpider):
    name = "incyspider"

    def __init__(self):
        super(incyspider, self).__init__()
        domain_name = "incyspider.co.uk"
        f = open("urls.csv")
        start_urls = [url.strip() for url in f.readlines()]
        f.close

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="Product"]')
        items = []
        for site in sites:
            item = incyspiderItem()
            item['title'] = hxs.select('//div[@class="Name"]/node()').extract()
            item['hlink'] = hxs.select('//div[@class="Price"]/node()').extract()
            item['price'] = hxs.select('//div[@class="Codes"]/node()').extract()
            items.append(item)
        return items
If I'm wrong, then please edit the question to explain how you know there is a problem with the code, e.g.: is the expected output different from the actual output? If so, how?
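One further note on the corrected spider above: the select calls inside the loop still query the whole page through hxs, so every item would repeat the same page-wide values. Querying relative to each site selector is the likely fix; a sketch, assuming the Name/Price/Codes divs sit inside each Product div:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    for site in hxs.select('//div[@class="Product"]'):
        item = incyspiderItem()
        # Relative queries ('.//') scope extraction to the current Product div
        item['title'] = site.select('.//div[@class="Name"]/node()').extract()
        item['hlink'] = site.select('.//div[@class="Price"]/node()').extract()
        item['price'] = site.select('.//div[@class="Codes"]/node()').extract()
        items.append(item)
    return items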
