ItemLoader in Scrapy - python

I can't seem to get the ItemLoader to work. I don't get any errors in the Scrapy log; just nothing gets extracted. Any ideas would be helpful!
import scrapy
from medium.items import MediumItem
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider
import logging
from scrapy.utils.log import configure_logging

class DataSpider(CrawlSpider):
    custom_settings = {
        'LOG_FILE': 'my_log.log',
        'LOG_LEVEL': 'ERROR'}
    logging.getLogger().addHandler(logging.StreamHandler())

    name = 'data'
    allowed_domains = ['medium.com', 'towardsdatascience.com']
    start_urls = ['https://medium.com/tag/python/archive/02/01']
    handle_httpstatus_list = [302]

    def parse(self, response):
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:
            if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                l = ItemLoader(item=MediumItem(), selector=article)
                l.default_output_processor = scrapy.loader.processors.TakeFirst()
                l.add_css('Title', 'div > h3::text')
                l.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
                l.add_css('Read', 'span::attr(title)')
                l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"]/text()')
                l.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
                l.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
                l.add_value('Page', response.url)
                yield l.load_item()
The items file is:
import scrapy
from scrapy.item import Item, Field

class MediumItem(Item):
    Title = scrapy.Field()
    Name = scrapy.Field()
    Date = scrapy.Field()
    Read = scrapy.Field()
    Publication = scrapy.Field()
    Claps = scrapy.Field()
    Responses = scrapy.Field()

At the start I found two problems:

First, the item needs
Page = scrapy.Field()

Second, the page https://medium.com/tag/python/archive/02/01 is redirected to https://medium.com/tag/python/archive, but the redirect is blocked by
handle_httpstatus_list = [302]

After removing handle_httpstatus_list I get data from the first page.
Result (CSV):
Claps,Date,Name,Page,Publication,Read,Responses,Title
81K,,Daniel van Flymen,https://medium.com/tag/python/archive,,9 min read,383 responses,Learn Blockchains by Building One
25K,,Jonny Fox,https://medium.com/tag/python/archive,,6 min read,63 responses,Regex tutorial — A quick cheatsheet by examples
9.6K,,Susan Li,https://medium.com/tag/python/archive,,9 min read,112 responses,"Building A Logistic Regression in Python, Step by Step"
5.8K,,Adi Bronshtein,https://medium.com/tag/python/archive,,9 min read,46 responses,Train/Test Split and Cross Validation in Python
7.8K,,Will Koehrsen,https://medium.com/tag/python/archive,,21 min read,42 responses,Random Forest in Python
7.2K,,Ted Petrou,https://medium.com/tag/python/archive,,24 min read,34 responses,Selecting Subsets of Data in Pandas: Part 1
11.1K,,Milo Spencer-Harper,https://medium.com/tag/python/archive,,6 min read,86 responses,How to build a simple neural network in 9 lines of Python code
5.2K,,Michael Galarnyk,https://medium.com/tag/python/archive,,8 min read,27 responses,PCA using Python (scikit-learn)
64K,,TK,https://medium.com/tag/python/archive,,11 min read,148 responses,Learning Python: From Zero to Hero
6.9K,,Susan Li,https://medium.com/tag/python/archive,,9 min read,75 responses,An End-to-End Project on Time Series Analysis and Forecasting with Python
The code I used - all in one file, without creating a project:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider
import logging
from scrapy.utils.log import configure_logging

class MediumItem(scrapy.Item):
    Title = scrapy.Field()
    Name = scrapy.Field()
    Date = scrapy.Field()
    Read = scrapy.Field()
    Publication = scrapy.Field()
    Claps = scrapy.Field()
    Responses = scrapy.Field()
    Page = scrapy.Field()

class DataSpider(CrawlSpider):
    custom_settings = {
        'LOG_FILE': 'my_log.log',
        'LOG_LEVEL': 'ERROR'}
    logging.getLogger().addHandler(logging.StreamHandler())

    name = 'data'
    allowed_domains = ['medium.com', 'towardsdatascience.com']
    start_urls = ['https://medium.com/tag/python/archive/02/01']
    #handle_httpstatus_list = [302]

    def parse(self, response):
        print('url:', response.url)
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:
            if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                l = ItemLoader(item=MediumItem(), selector=article)
                l.default_output_processor = scrapy.loader.processors.TakeFirst()
                l.add_css('Title', 'div > h3::text')
                l.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
                l.add_css('Read', 'span::attr(title)')
                l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"]/text()')
                l.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
                l.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
                l.add_value('Page', response.url)
                yield l.load_item()

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEED_FORMAT': 'csv',  # csv, json, xml
    'FEED_URI': 'output.csv',
})

c.crawl(DataSpider)
c.start()
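Because everything lives in one file, it runs without a Scrapy project: save it as, say, script.py (the filename is arbitrary) and run it with python script.py; the FEED_FORMAT and FEED_URI settings write the scraped rows to output.csv.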

Related

I implemented an ItemLoader in my Scrapy project to format the data, and now it no longer adds anything to the CSV file

I created a Scrapy project to scrape some information off this classifieds website; however, the data I was getting needed to be formatted. After doing some research I figured out how to implement an ItemLoader, but now it does not write any scraped data to the CSV file.
Here's my spider.py:
import scrapy
from ..items import TestItem
from scrapy.loader import ItemLoader

class TestSpiderSpider(scrapy.Spider):
    name = 'test'
    page_number = 2
    start_urls = ['https://jamaicaclassifiedonline.com/auto/cars/']

    def parse(self, response):
        for car in response.css('.col.l3.s12.m6'):
            items = TestItem()
            product_title = car.css('.jco-card-title::text').extract()
            product_imagelink = car.css('.card-image img::attr(data-src)').getall()
            urls = car.css('.card-image a::attr(href)').getall()
            for url in urls:
                url = response.urljoin(url)
                yield scrapy.Request(url=url, callback=self.parse_details)
            if product_title and product_imagelink:
                items['urls'] = urls

    def parse_details(self, response):
        l = ItemLoader(item=TestItem(), selector=response)
        l.add_css('product_title', '#title::text')
        yield l.load_item()
Here's my items.py:
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags

class TestItem(scrapy.Item):
    product_title = scrapy.Field(input_processors=MapCompose(remove_tags), output_processor=TakeFirst())
Here's my settings.py:
BOT_NAME = 'test'
SPIDER_MODULES = ['test.spiders']
NEWSPIDER_MODULE = 'test.spiders'
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}
Here's my pipeline.py:
class TestPipeline:
    def process_item(self, item, spider):
        return item
You don't need pipelines enabled to use an ItemLoader; try without them.
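One more thing worth checking (my observation, not tested against the site): scrapy.Field() accepts arbitrary keyword metadata, and the loader only looks up the keys input_processor and output_processor (singular), so the input_processors= spelling in items.py is silently ignored. A minimal sketch of items.py with the keys the loader actually reads:

import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags

class TestItem(scrapy.Item):
    # ItemLoader reads these exact metadata keys; a misspelled key
    # raises no error because Field() accepts any keyword argument
    product_title = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    # parse() assigns items['urls'], which also needs a declared field
    urls = scrapy.Field()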

How to scrape JSON web pages

Hey, so I have some experience scraping HTML but never JSON, and I need to scrape the following web page using Scrapy: http://www.starcitygames.com/buylist/search?search-type=category&id=5061. I found a tutorial online that uses Scrapy along with JMESPath to scrape JSON data from the web. I got the tutorial to work, but I am trying to alter it to work with my website, with no luck. No errors, but it does not return any data. Any help would be greatly appreciated!
items.py
import scrapy

class NameItem(scrapy.Item):
    """User item definition for jsonplaceholder /LoginSpider endpoint."""
    name = scrapy.Field()
    condition = scrapy.Field()
    price = scrapy.Field()
    rarity = scrapy.Field()
LoginSpider.py
import scrapy
import json
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import NameItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, SelectJmes

class UserSpider(scrapy.Spider):
    """Spider to scrape `http://www.starcitygames.com/buylist/search?search-type=category&id=5061`."""
    name = 'LoginSpider'
    allowed_domains = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']
    start_urls = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']

    # dictionary to map UserItem fields to Jmes query paths
    jmes_paths = {
        'name': 'name',
        'condition': 'condition',
        'price': 'price',
        'rarity': 'rarity',
    }

    def parse(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        for user in jsonresponse:
            loader = ItemLoader(item=NameItem())  # create an ItemLoader to populate a NameItem
            loader.default_input_processor = MapCompose(str)  # apply str conversion on each value
            loader.default_output_processor = Join(' ')
            for (field, path) in self.jmes_paths.items():
                loader.add_value(field, SelectJmes(path)(user))
            yield loader.load_item()
The response of the url http://www.starcitygames.com/buylist/search?search-type=category&id=5061 has 3 top-level keys:
'Ok'
'search'
'results' ## this contains the data
The results key has multiple values, which you should iterate over; inside those values is the data.
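Roughly, that shape looks like the following (the field names come from the item definition; the exact nesting is my assumption, inferred from the loops in the answer code below):

{
    "Ok": true,
    "search": "...",
    "results": [
        [
            {"name": "...", "condition": "...", "price": "...", "rarity": "..."}
        ]
    ]
}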
Try this code; I hope it helps.
This is the items.py module:
import scrapy

class SoResponseItem(scrapy.Item):
    name = scrapy.Field()
    condition = scrapy.Field()
    price = scrapy.Field()
    rarity = scrapy.Field()
This is the spider:
import scrapy
import json
from SO_response.items import SoResponseItem

class LoginspiderSpider(scrapy.Spider):
    name = 'LoginSpider'
    allowed_domains = ['www.starcitygames.com']
    url = 'http://www.starcitygames.com/'

    def start_requests(self):
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        url = response.urljoin('buylist/search?search-type=category&id=5061')
        yield scrapy.Request(url=url, callback=self.parse_data)

    def parse_data(self, response):
        jsonresponse = json.loads(response.body)
        for result in jsonresponse['results']:
            for index in range(len(result)):
                items = SoResponseItem()
                items['name'] = result[index]['name']
                items['condition'] = result[index]['condition']
                items['price'] = result[index]['price']
                items['rarity'] = result[index]['rarity']
                yield items
Try it in your shell (the spider name is required, and -o sets the output file):
scrapy crawl LoginSpider -o jmes.json

No output while scraping using scrapy

I was going to scrape the commentary from the espncricinfo website using Scrapy, and the output (items.csv) I got was blank. These are my files.
cricinfo.py (Spider File)
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from crictest.items import CrictestItem

class MySpider(BaseSpider):
    name = "cricinfo"
    allowed_domains = ["espncricinfo.com/"]
    start_urls = ["http://www.espncricinfo.com/champions-league-twenty20-2014/engine/match/763595.html?innings=1;view=commentary/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//td[@class="battingComms" and b]')
        for row in rows:
            item = CrictestItem()
            item['overnum'] = row.select('b/text()').extract()[0]
            item['overnumtext'] = row.select('b/following-sibling::text()').extract()[0]
            yield item
items.py
import scrapy

class CrictestItem(scrapy.Item):
    overnum = scrapy.Field()
    overnumtext = scrapy.Field()
The problem is your XPath. You can try testing this one in Chrome:
$x('//*[@id="commInnings"]/div[2]/div/div')
In your code, rewrite the line
rows = hxs.select('//td[@class="battingComms" and b]')
to use it; with the original XPath I can't get any output in the console.
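In spider terms, that suggestion would look roughly like this (wrapping the selector back into parse is my reading; the two field-extraction lines are carried over unchanged and may need adjusting to whatever the new nodes contain):

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # selector suggested above, testable in the Chrome console via $x(...)
    rows = hxs.select('//*[@id="commInnings"]/div[2]/div/div')
    for row in rows:
        item = CrictestItem()
        item['overnum'] = row.select('b/text()').extract()[0]
        item['overnumtext'] = row.select('b/following-sibling::text()').extract()[0]
        yield item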

writing itemloader by item to xml or csv using scrapy

I am a Scrapy newbie and have written the spider below. I want to write to XML or CSV, with each row in the CSV (or each item in the XML) as name, tele, addr.
I am using the command:
scrapy crawl abc -o items.csv -t csv
I am looking for output:
name,addr,tele
n1,a1,t1
n2,a2,t2
n3,a3,t3
But I get:
name,addr,tele
n1,n2,n3 a1,a2,a3 t1,t2,t3
Spider Code
import scrapy
from abc.items import abcItem
from scrapy.contrib.loader import ItemLoader

class abcSpider(scrapy.Spider):
    name = "abc"
    allowed_domains = ["abc.com"]
    start_urls = ["abc.com/"]

    def parse(self, response):
        items = []
        l = ItemLoader(item=abcItem(), response=response)
        l.add_xpath('name', '//section[@class="abcrp"]/a/@title')
        l.add_xpath('tele', '//p[@class="abcw"]/a/@href')
        l.add_xpath('addr', '//span[@class="dn"]/text()')
        return l.load_item()
items code
import scrapy

class abcItem(scrapy.Item):
    name = scrapy.Field()
    addr = scrapy.Field()
    tele = scrapy.Field()
I was able to resolve this. I used a for loop on an outer tag that contained my name, addr, and tele tags, roughly as in the sketch below.
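A minimal sketch of that fix (the outer-tag XPath //div[@class="result"] is a placeholder of mine; the point is one loader per outer node, with relative queries):

def parse(self, response):
    # one ItemLoader per outer tag -> one CSV row per item
    for entry in response.xpath('//div[@class="result"]'):  # hypothetical outer tag
        l = ItemLoader(item=abcItem(), selector=entry)
        # leading './/' keeps each query relative to the current entry
        l.add_xpath('name', './/section[@class="abcrp"]/a/@title')
        l.add_xpath('tele', './/p[@class="abcw"]/a/@href')
        l.add_xpath('addr', './/span[@class="dn"]/text()')
        yield l.load_item()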

Scrapy: one row per item

I'm using Scrapy to scrape category pages for products from architonic.com. However, I would like to display these products in a CSV, one per row. In the current situation all the brand names from a given category page are listed under 'brand', while I would like to have an output like this:
{'brand': [u'Elisabeth Ellefsen'],
'title': [u'Up chair I 907'],
'img_url': [u'http://image.architonic.com/img_pro1-1/117/4373/t-up-06f-sq.jpg'],
'link': [u'http://www.architonic.com/pmsht/up-chair-tonon/1174373']
}
I tried playing with the Item Loaders (added default_output_processor = TakeFirst()), adding 'yield item' (see the commented code), and searched for two days for a solution, without luck. Hoping someone is willing to help me. Any help is really appreciated.
My output looks like this:
2013-01-14 11:53:23+0100 [archi] DEBUG: Scraped from <200 http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/3>
{'brand': [u'Softline',
u'Elisabeth Ellefsen',
u'Sellex',
u'Lievore Altherr Molina',
u'Poliform',
.....
u'Hans Thyge & Co.'],
'img_url': [u'http://image.architonic.com/img_pro1-1/117/3661/terra-h-sq.jpg',
u'http://image.architonic.com/img_pro1-1/117/0852/fly-01-sq.jpg',
u'http://image.architonic.com/img_pro1-1/116/9870/ley-0004-sq.jpg',
u'http://image.architonic.com/img_pro1-1/117/1023/arflex-hollywood-03-sq.jpg',
...
u'http://image.architonic.com/img_pro1-1/118/5357/reef-002-sq.jpg'],
'link': [u'http://www.architonic.com/pmsht/terra-softline/1173661',
u'http://www.architonic.com/pmsht/fly-sellex/1170852',
u'http://www.architonic.com/pmsht/ley-poliform/1169870',
.....
u'http://www.architonic.com/pmsht/reef-collection-labofa/1185357'],
'title': [u'Terra',
u'Fly',
u'Ley chair',
.....
u'Hollywood Sofa',
u'Pouff Round']}
I'm using this in spider/archi_spider.py
import string
import re
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.utils.markup import remove_entities
from archiscraper.items import ArchiItemFields, ArchiLoader

class ArchiScraper(BaseSpider):
    name = "archi"
    allowed_domains = ["architonic.com"]
    start_urls = ['http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/%s' % page for page in xrange(1, 4)]
    # rules = (Rule(SgmlLinkExtractor(allow=('.',), restrict_xpaths=('//*[@id="right_arrow"]',)),
    #               callback="parse_items", follow=True),
    #          )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//li[contains(@class, "nav_pro_item")]')
        items = []
        for site in sites:
            item = ArchiLoader(ArchiItemFields(), site)
            item.add_xpath('brand', '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
            item.add_xpath('designer', '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
            item.add_xpath('title', '//*[contains(@class, "nav_pro_text")]/a/strong/text()')
            item.add_xpath('img_url', '//li[contains(@class, "nav_pro_item")]/div/a/img/@src[1]')
            item.add_xpath('link', '//*[contains(@class, "nav_pro_text")]/a/@href')
            items.append(item.load_item())
        return items
        # for item in items:
        #     yield item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
import string
from scrapy.item import Item, Field
from scrapy.contrib.loader.processor import MapCompose, Join, TakeFirst
from scrapy.utils.markup import remove_entities
from scrapy.contrib.loader import XPathItemLoader

class ArchiItem():
    pass

class ArchiItemFields(Item):
    brand = Field()
    title = Field()
    designer = Field()
    img_url = Field()
    img = Field()
    link = Field()

class ArchiLoader(XPathItemLoader):
    # default_input_processor = MapCompose(unicode.strip)
    # default_output_processor = TakeFirst()
    brand_out = MapCompose(unicode.strip)
    # title_out = Join()
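For completeness, a sketch of the likely fix (my assumption, in the same Scrapy 0.x style as the question): every add_xpath above starts with //, so each loader re-selects the whole page rather than the current <li>. Anchoring the queries with .// and enabling the commented-out default_output_processor = TakeFirst() in ArchiLoader should produce one product per row:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    for site in hxs.select('//li[contains(@class, "nav_pro_item")]'):
        item = ArchiLoader(ArchiItemFields(), site)
        # './/' scopes each query to the current <li> instead of the whole page
        item.add_xpath('brand', './/*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
        item.add_xpath('designer', './/*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
        item.add_xpath('title', './/*[contains(@class, "nav_pro_text")]/a/strong/text()')
        item.add_xpath('img_url', './/div/a/img/@src')
        item.add_xpath('link', './/*[contains(@class, "nav_pro_text")]/a/@href')
        yield item.load_item()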
