scrapy json output all items on one line - python

I'm trying to get my output to look like the following in json format.
{"loser": "De Schepper K." ,"winner": "Herbert P.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
But I'm currently getting individual lines for each loser item and winner item. I would like both winner and loser to be on the same line with the url.
{"loser": "De Schepper K.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
{"winner": "Herbert P.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
{"loser": "Sugita Y.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
I'm not sure if it's my selectors that are causing this behaviour, but I'd like to know how I can customise the pipeline so the loser, winner and date are all on the same json line.
I've never extracted to json format before, so it's new to me.
How do you specify which json keys and values will be on each line using a custom pipeline?
I also tried to use the csv item exporter to do this and got strange behaviour too (ref: Scrapy output is showing empty rows per column).
Here's my spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import json
from scrapy.http import Request, FormRequest

class MySpider(scrapy.Spider):
    name = "jsscraper"
    start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 1.5})

    def parse(self, response):
        for row in response.css('.event-team'):
            il = SofascoreItemLoader(selector=row)
            il.add_css('winner', '.event-team:nth-child(2)::text')
            il.add_css('loser', '.event-team:nth-child(1)::text')
            il.add_value('url', response.url)
            yield il.load_item()
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from operator import methodcaller
from scrapy import Spider, Request, Selector

class SofascoreItem(scrapy.Item):
    loser = scrapy.Field()
    winner = scrapy.Field()
    url = scrapy.Field()

class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
pipeline.py
import json
import codecs
from collections import OrderedDict

class JsonPipeline(object):
    def __init__(self):
        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(OrderedDict(item), ensure_ascii=False,
                          sort_keys=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()

The problem here is that you're looping over .event-team elements.
Each of these elements is either the winner or the loser, so you get a separate item for each.
What you should do instead is loop over elements that contain both (.list-event seems like a good candidate) and extract both the winner and the loser from each of those.
This way, you'd have one loop per event, and as a result, one item per event.
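A rough sketch of that parse loop (treat .list-event and the child selectors as assumptions about the page's markup; adjust them to whatever element actually wraps a single match):

def parse(self, response):
    # one row per event, each row containing both teams
    for row in response.css('.list-event'):
        il = SofascoreItemLoader(selector=row)
        il.add_css('winner', '.event-team:nth-child(2)::text')
        il.add_css('loser', '.event-team:nth-child(1)::text')
        il.add_value('url', response.url)
        yield il.load_item()

Each yielded item now carries winner, loser and url together, so the existing JsonPipeline already writes them as a single json line.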

So I took another look at your question, and I now see where the problem is:
for row in response.css('.event-team'):
With the above line you will get many Selectors (a SelectorList). However, from each Selector, or row, you can only get one field: winner or loser. You can't get them both.
That's why there will be empty rows in your output.
Solution: try the following line instead:
for row in response.css('div[class="cell__section--main s-tennisCell curb-width"]'):
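A quick sketch of what the body of that loop would then look like (the class string comes from the suggestion above, so treat it as an assumption about the current markup):

for row in response.css('div[class="cell__section--main s-tennisCell curb-width"]'):
    il = SofascoreItemLoader(selector=row)
    # both team cells live inside the same row, so one item gets both fields
    il.add_css('winner', '.event-team:nth-child(2)::text')
    il.add_css('loser', '.event-team:nth-child(1)::text')
    il.add_value('url', response.url)
    yield il.load_item()

Each row then yields a single item with both players and the url, which is what ends up as one json line in the output.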

Related

Scrapy only returning a single item from list

I believe that I have my xpaths coded in the incorrect way, as I only get a single result for each url, whereas there are 25 job posts in total for each url (not including those on the next page). How can I correct my xpaths to get all the results?
Here's my scraper:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict

class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
There was a slight mistake with the requests, which I have updated for those of you who checked within the first 15 minutes of my posting it.
The problem was in the container's xpath: it only selected the container itself, without the items inside it, so you looped once over the container rather than over the actual items you want to scrape.
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict

class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        # select each result item, not just the list container
        container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            # the xpaths are relative (note the leading ".") so each loader
            # only looks inside the current result item
            loader.add_xpath('title', './/article[@id]//a[@title]/@title')
            loader.add_xpath('salary', './/article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', './/article[@id]/div//div/p/a//text()')
            yield loader.load_item()
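Since CrawlerProcess is imported but never used above, here is a minimal sketch of running the spider as a standalone script; the FEEDS output file name is just an illustrative choice and assumes a reasonably recent Scrapy version:

if __name__ == '__main__':
    process = CrawlerProcess(settings={
        # write one JSON object per line to a (hypothetical) output file
        'FEEDS': {'jobs.jsonl': {'format': 'jsonlines'}},
    })
    process.crawl(CvSpider)
    process.start()  # blocks here until the crawl finishes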

using regex on scrapy item loader

I'm trying to figure out how to use regex with scrapy item loaders.
I've tried to use a lambda function with split() and got the following error: Split cannot be defined. You can see the function is commented out in the item loader class.
What I'm trying to do is remove all the text before the date, including the "/", from the date item. The date item is the url that I've just parsed:
"https://www.sofascore.com/tennis/2018-02-07"
How do I use regex with scrapy item loaders?
Can I pass in the regex to the item loader or do I have to process it at the spider?
spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import json
from scrapy.http import Request, FormRequest

class MySpider(scrapy.Spider):
    name = "jsscraper"
    start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 1.5})

    def parse(self, response):
        for row in response.css('.event-team'):
            il = SofascoreItemLoader(selector=row)
            il.add_css('winner', '.event-team:nth-child(2)::text')
            il.add_css('loser', '.event-team:nth-child(1)::text')
            il.add_value('date', response.url)
            yield il.load_item()
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Split
from operator import methodcaller
from scrapy import Spider, Request, Selector

class SofascoreItem(scrapy.Item):
    loser = scrapy.Field()
    winner = scrapy.Field()
    date = scrapy.Field()

class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
    #review_in = MapCompose(lambda x: x.split("/" , [-1]))
You can pass a regular expression directly when adding the value:
il.add_value('date', response.url, re='([^/]+)$')
See https://doc.scrapy.org/en/latest/topics/loaders.html for more details.
Here is what is wrong with the code.
You are not required to 'feed' the item loader with add_value, but if you don't, the field won't be populated in the end.
class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
    review_in = MapCompose(lambda x: x.split("/")[-1])
You have to do the split and then select the last item of the list it produces. split(SEPARATOR, [-1]) is not what you want: the second argument to split() is maxsplit, an integer limiting how many splits are performed, so a list is not valid there.
Second, you want to add the url value to the review field, right?
This is not an answer on how to use regex in a scrapy ItemLoader, but you do not need one here. You just need to use the split method properly.
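To tie the two suggestions together, here is a minimal sketch using the question's own names (SofascoreItem and the date field); date_in is used rather than review_in so the processor actually matches the field being loaded:

from operator import methodcaller
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst
from scrapejs.items import SofascoreItem

class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
    # runs on every value added to the "date" field
    date_in = MapCompose(lambda x: x.split("/")[-1])  # ".../tennis/2018-02-07" -> "2018-02-07"

Alternatively, leave the loader unchanged and apply the regex only where the value is added, inside parse(): il.add_value('date', response.url, re='([^/]+)$') produces the same "2018-02-07".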

Scrapy: How to populate an item with data from two websites

I'd like to collect an item's data from 2 different websites.
It should work as follows:
parse_website_1 fetches a person's name from website_1 and populates the item
parse_website_1 yields a request for parse_website_2
parse_website_2 parses website_2, collects the person's hair color based on the name scraped from website_1, and populates the item
parse_website_2 loads the item
Would this be in the right direction, given that the item is defined in items.py:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem

class MySpider(scrapy.Spider):
    name = "myspider"

    def __init__(self):
        self.item = ItemLoader(item=MyItem(), response=response)

    def start_requests(self):
        scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        self.item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2)

    def parse_website_2(self, response):
        self.item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield self.item.load_item()
The idea is right, but the implementation is not correct in that you are trying to pass data between consecutive requests using an instance attribute (self.item). Scrapy requests are asynchronous, so this would not work as expected.
The correct way to do it is outlined in the Scrapy FAQ: pass the partial item data to the consecutive request using the Request's meta attribute, retrieve it from the Response's meta attribute in the next callback, add some more data, and finally yield the item. Here's the adapted code:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem

class MySpider(scrapy.Spider):
    name = "myspider"

    def start_requests(self):
        yield scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        item = ItemLoader(item=MyItem(), response=response)
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2, meta={'item': item})

    def parse_website_2(self, response):
        item = response.meta['item']
        # the loader was built from website_1's response, so extract the value
        # from this response explicitly instead of relying on add_xpath
        item.add_value("hair_color", response.xpath('//div[@class="hair_color"]').extract_first())
        yield item.load_item()

Scrapy: Scraping very select URLs

I am trying to scrape yahoo stocks for a school project, but I have no idea how to go through each page when the links follow a very specific format. The goal is to iterate through each stock, where only a certain portion of the url changes, like so:
Starting URL = ["https://ca.finance.yahoo.com/q/hp?s=BMO.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"]
The next URL would be something like:
#Canadian Imperial(note the "CM"):
"https://ca.finance.yahoo.com/q/hp?s=CM.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
#Blackberry (note the "BB"):
"https://ca.finance.yahoo.com/q/hp?s=BB.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
etc...
In other words, the only thing that would change would be the characters between "hp?s=" and ".TO&a".
I'm wondering if this is possible or not. The ending portion of the URL must stay the same, as that is the page I need to get to. Unfortunately, there are no links within each page on yahoo to go to other stocks.
If I could do this with Scrapy's Rules and SgmlLinkExtractor, that would be preferable.
Would appreciate any help!
Thanks!
Current Scrapy code:
from scrapy.spider import Spider
from scrapy.selector import Selector
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["ca.finance.yahoo.com"]
    start_urls = [
        "https://ca.finance.yahoo.com/q/hp?s=BMO.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
    ]
    rules = [
        Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"), follow=True)
    ]

    def parse(self, response):
        item = Website()
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        print item['name']
Make a rule to follow the links matching the pattern:
rules = [
    Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.\w+&a=\d+&b=\d+&c=\d+&d=\d+&e=\d+&f=\d+&g=m"), follow=True)
]
Though, I am not sure that you need to check for all URL parameters here. Simplified version:
rules = [
    Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.\w+"), follow=True)
]
And, don't forget the imports:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
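For completeness, a rough sketch of how that rule slots into a CrawlSpider (the spider name and callback name are illustrative, and the callback body is lifted from the question's parse method, renamed because a CrawlSpider should not override parse):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from dirbot.items import Website

class StocksSpider(CrawlSpider):
    name = "stocks"
    allowed_domains = ["ca.finance.yahoo.com"]
    start_urls = [
        "https://ca.finance.yahoo.com/q/hp?s=BMO.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
    ]
    # follow every link matching the quote-history pattern and parse it
    rules = [
        Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.\w+"), callback="parse_quote", follow=True),
    ]

    def parse_quote(self, response):
        item = Website()
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        yield item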
Here's an example of what I was talking about in the comments I left.
import urllib
import os

company_symbol = ["ACGL", "AFSI", "AGII", "AGNC", "ANAT", "ARCP", "ASBC", "ASPS", "BANF", "BBCN", "BGCP", "BNCL", "BOKF", "BPOP", "BRKL", "CACC", "CATY", "CBOE", "CBSH", "CFFN", "CHFC", "CINF", "CME ", "COLB", "CVBF", "ERIE", "ESGR", "ETFC", "EWBC", "EZPW", "FCFS", "FCNC", "FFBC", "FFIN", "FITB", "FMBI", "FMER", "FNFG", "FNGN", "FSRV", "FULT", "GBCI", "GLPI", "GLRE", "HBAN", "HBHC", "HLSS", "HOMB", "IBKC", "IBKR", "IBOC", "IPCC", "ISBC", "KRNY", "LPLA", "MBFI", "MHLD", "MKTX", "MTGE", "NAVG", "NBTB", "NDAQ", "NFBK", "NPBC", "NTRS", "NWBI", "ORIT", "OZRK", "PACW", "PBCT", "PCH ", "PNFP", "PRAA", "PVTB", "ROIC", "SAFT", "SBNY", "SBRA", "SCBT", "SEIC", "SIGI", "SIVB", "SLM ", "STFC", "SUSQ", "TCBI", "TFSL", "TRMK", "TROW", "UBSI", "UMBF", "UMPQ", "VRTS", "WABC", "WAFD", "WETF", "WRLD", "WTFC", "Z", "ZION"]

for company in company_symbol:
    url = 'http://finance.google.com/finance/info?client=ig&q={0}:{1}'.format(company, 'NASDAQ')
    nasdaq = urllib.urlopen(url)
    text = nasdaq.read()
    filename = 'nasdaq.txt'.format(company)
    with file(filename, 'a') as output:
        output.write(str(text))
This code was written as an example of one way to change urls and do something with each url.
If you need to scrape only predefined quotes for a given period, then the logic is the following:
Prepare the list of quotes you are interested in, e.g. ['ABC', 'XYZ', 'LOL', ...].
Use a basic scrapy.Spider.
Define a start_requests() method and yield a sequence of requests from it.
Sample implementation:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ["ca.finance.yahoo.com"]
    quotes = ["BMO", "CM", "BB"]
    url_template = ("https://ca.finance.yahoo.com/q/hp?s=%s.TO"
                    "&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m")

    def start_requests(self):
        for quote in self.quotes:
            url = self.url_template % quote
            yield Request(url)

    def parse(self, response):
        pass  # process the response here
But if you need to get data for ALL TSX quotes, then I would recommend scraping them from the available listings and then proceeding as in the example above. Crawling the entire ca.finance.yahoo.com is obviously a bad idea.
If you have a list of stocks you want to load the yahoo page for, you can get a list of the yahoo urls like this:
url_template = "https://ca.finance.yahoo.com/q/hp?s={}.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
stocks = ['CM', 'BB']
urls = [url_template.format(stock) for stock in stocks]
I haven't used scrapy, though, so I'm not sure if this is what you need.

Scrape using multiple POST data from the same URL

I have already created one spider that collects a list of company names with matching phone numbers. This is then saved to a CSV file.
I then want to scrape data from another site, using the phone numbers in the CSV file as POST data. I want it to loop through the same start URL, scraping the data that each phone number produces, until there are no numbers left in the CSV file.
This is what I have got so far:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy import log
import sys
from scrapy.shell import inspect_response
from btw.items import BtwItem
import csv

class BtwSpider(BaseSpider):
    name = "btw"
    allowed_domains = ["siteToScrape.com"]
    start_urls = ["http://www.siteToScrape.com/broadband/broadband_checker"]

    def parse(self, response):
        phoneNumbers = ['01253873647', '01253776535', '01142726749']
        return [FormRequest.from_response(response,
                                          formdata={'broadband_checker[phone]': phoneNumbers[1]},
                                          callback=self.after_post)]

    def after_post(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="results"]')
        items = []
        for site in sites:
            item = BtwItem()
            fttcText = site.select("div[@class='content']/div[@id='btfttc']/ul/li/text()").extract()
            # Now we will change the text to be a boolean value
            if fttcText[0].count('not') > 0:
                fttcEnabled = 0
            else:
                fttcEnabled = 1
            item['fttcAvailable'] = fttcEnabled
            items.append(item)
        return items
At the minute I have just been trying to get this to loop through the list (phoneNumbers), but I have not even managed to get that to work so far. Once I know how to do that, I will be able to get it to pull the numbers from a CSV file by myself. In its current state it is just using the phone number at index 1 in the list.
Assuming you have a phones.csv file with phones in it:
01253873647
01253776535
01142726749
Here's your spider:
import csv
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector

class BtwItem(Item):
    fttcAvailable = Field()
    phoneNumber = Field()

class BtwSpider(BaseSpider):
    name = "btw"
    allowed_domains = ["samknows.com"]

    def start_requests(self):
        yield Request("http://www.samknows.com/broadband/broadband_checker", self.parse_main_page)

    def parse_main_page(self, response):
        with open('phones.csv', 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                phone_number = row[0]
                yield FormRequest.from_response(response,
                                                formdata={'broadband_checker[phone]': phone_number},
                                                callback=self.after_post,
                                                meta={'phone_number': phone_number})

    def after_post(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="results"]')
        phone_number = response.meta['phone_number']
        for site in sites:
            item = BtwItem()
            fttc = site.select("div[@class='content']/div[@id='btfttc']/ul/li/text()").extract()
            item['phoneNumber'] = phone_number
            item['fttcAvailable'] = 'not' in fttc[0]
            yield item
Here's what was scraped after running it:
{'fttcAvailable': False, 'phoneNumber': '01253873647'}
{'fttcAvailable': False, 'phoneNumber': '01253776535'}
{'fttcAvailable': True, 'phoneNumber': '01142726749'}
The idea is to scrape the main page using start_requests, then read the csv file line-by-line in the callback and yield new Requests for each phone number (csv row). Additionally, pass phone_number to the callback through the meta dictionary in order to write it to the Item field (I think you need this to distinguish items/results).
Hope that helps.
