using regex on scrapy item loader - python

I'm trying to figure out how to use regex with Scrapy item loaders.
I tried to use a lambda function with split() and got an error saying that Split is not defined. You can see the attempt commented out in the item loader class below.
What I'm trying to do is remove all the text before the date, including the "/", from the date item. The date item is populated from the URL I've just parsed:
"https://www.sofascore.com/tennis/2018-02-07"
How do I use regex with Scrapy item loaders?
Can I pass the regex to the item loader, or do I have to process it in the spider?
spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import json
from scrapy.http import Request, FormRequest

class MySpider(scrapy.Spider):
    name = "jsscraper"
    start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 1.5})

    def parse(self, response):
        for row in response.css('.event-team'):
            il = SofascoreItemLoader(selector=row)
            il.add_css('winner', '.event-team:nth-child(2)::text')
            il.add_css('loser', '.event-team:nth-child(1)::text')
            il.add_value('date', response.url)
            yield il.load_item()
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Split
from operator import methodcaller
from scrapy import Spider, Request, Selector

class SofascoreItem(scrapy.Item):
    loser = scrapy.Field()
    winner = scrapy.Field()
    date = scrapy.Field()

class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
    #review_in = MapCompose(lambda x: x.split("/" , [-1]))

You can pass a regex straight to the loader method:
il.add_value('date', response.url, re='([^/]+)$')
See https://doc.scrapy.org/en/latest/topics/loaders.html for more details.
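A minimal sketch of how that slots into the question's parse callback (same loader as above; the regex keeps only what follows the last "/"):

def parse(self, response):
    for row in response.css('.event-team'):
        il = SofascoreItemLoader(selector=row)
        il.add_css('winner', '.event-team:nth-child(2)::text')
        il.add_css('loser', '.event-team:nth-child(1)::text')
        # '([^/]+)$' captures everything after the last "/", e.g. "2018-02-07"
        il.add_value('date', response.url, re='([^/]+)$')
        yield il.load_item()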

Here is what is wrong with the code.
Strictly speaking, you do not have to 'feed' the item loader with add_value, but if you never do, the field won't get populated in the end.
class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
    review_in = MapCompose(lambda x: x.split("/")[-1])
You have to split and then select the last item of the list that the split produces. split(SEPARATOR, [-1]) is not what you want: the second argument to split() is maxsplit, an integer that limits how many splits are performed, not an index for selecting a part.
Second, you want to add the URL value to the review field, right?
This is not an answer on how to use regex in a Scrapy ItemLoader, but you do not need regex here. You just need to use the split method properly.
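For example, with the URL from the question:

url = "https://www.sofascore.com/tennis/2018-02-07"

url.split("/")[-1]  # '2018-02-07' (split, then take the last piece)
url.split("/", 1)   # ['https:', '/www.sofascore.com/tennis/2018-02-07'] (maxsplit=1, not an index)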

Related

Scrapy only returning a single item from list

I believe that I have coded my XPaths incorrectly, as I only get a single result for each URL, whereas there are 25 job posts in total for each URL (not including those on the next page). How can I correct my XPaths to get all the results?
Here's my scraper:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict

class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
There was a slight mistake in the requests, which I updated for those of you who checked during the first 15 minutes after I posted.
The problem was in the container's XPath. You were selecting only the container, without the items inside it, so you looped once over the container itself instead of over the actual items you want to scrape.
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict

class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            # .// keeps each XPath relative to the current <li> row
            loader.add_xpath('title', './/article[@id]//a[@title]/@title')
            loader.add_xpath('salary', './/article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', './/article[@id]/div//div/p/a//text()')
            yield loader.load_item()
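One detail to be aware of in the parse method above: an XPath beginning with // searches the whole document even when called on a row selector, so the field XPaths inside the loop use a .// prefix to stay scoped to the current row:

# relative: matches only inside the current row
lists.xpath('.//article[@id]//a[@title]/@title')
# absolute: matches across the whole document, even on a row selector
lists.xpath('//article[@id]//a[@title]/@title')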

How to scrape JSON web pages

Hey, so I have some experience scraping HTML but never JSON, and I need to scrape the following web page using Scrapy: http://www.starcitygames.com/buylist/search?search-type=category&id=5061. I found a tutorial online that uses Scrapy along with JMESPath to scrape JSON data from the web, and I got the tutorial to work, but I'm trying to alter it to work with my website with no luck. No errors, but it does not return any data. Any help would be greatly appreciated!
items.py
import scrapy

class NameItem(scrapy.Item):
    """User item definition for jsonplaceholder /LoginSpider endpoint."""
    name = scrapy.Field()
    condition = scrapy.Field()
    price = scrapy.Field()
    rarity = scrapy.Field()
LoginSpider.py
import scrapy
import json
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from ..items import NameItem
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, MapCompose, SelectJmes

class UserSpider(scrapy.Spider):
    """Spider to scrape `http://www.starcitygames.com/buylist/search?search-type=category&id=5061`."""
    name = 'LoginSpider'
    allowed_domains = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']
    start_urls = ['http://www.starcitygames.com/buylist/search?search-type=category&id=5061']

    # dictionary to map NameItem fields to JMESPath query paths
    jmes_paths = {
        'name': 'name',
        'condition': 'condition',
        'price': 'price',
        'rarity': 'rarity',
    }

    def parse(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        for user in jsonresponse:
            loader = ItemLoader(item=NameItem())  # create an ItemLoader to populate a NameItem
            loader.default_input_processor = MapCompose(str)  # apply str conversion on each value
            loader.default_output_processor = Join(' ')
            for (field, path) in self.jmes_paths.items():
                loader.add_value(field, SelectJmes(path)(user))
            yield loader.load_item()
The response of this URL http://www.starcitygames.com/buylist/search?search-type=category&id=5061 has 3 levels:
'Ok'
'search'
'results'  ## this contains the data
The results key has multiple values, which you should iterate over; the data is inside those values.
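For reference, this is the shape the spider below assumes, with placeholder values (illustrative only, not real data from the site):

# assumed response shape; all field values are placeholders
jsonresponse = {
    "Ok": True,
    "search": "...",
    "results": [
        [{"name": "...", "condition": "...", "price": "...", "rarity": "..."}],
        # ...more lists of entries...
    ],
}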
Try this code; I hope it helps.
This is the module items.py
import scrapy

class SoResponseItem(scrapy.Item):
    name = scrapy.Field()
    condition = scrapy.Field()
    price = scrapy.Field()
    rarity = scrapy.Field()
This is the spider
import scrapy
import json
from SO_response.items import SoResponseItem

class LoginspiderSpider(scrapy.Spider):
    name = 'LoginSpider'
    allowed_domains = ['www.starcitygames.com']
    url = 'http://www.starcitygames.com/'

    def start_requests(self):
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        url = response.urljoin('buylist/search?search-type=category&id=5061')
        yield scrapy.Request(url=url, callback=self.parse_data)

    def parse_data(self, response):
        jsonresponse = json.loads(response.body)
        for result in jsonresponse['results']:
            for index in range(len(result)):
                items = SoResponseItem()
                items['name'] = result[index]['name']
                items['condition'] = result[index]['condition']
                items['price'] = result[index]['price']
                items['rarity'] = result[index]['rarity']
                yield items
Try this in your shell:
scrapy crawl LoginSpider -o jmes.json

scrapy json output all items on one line

I'm trying to get my output to look like the following in json format.
{"loser": "De Schepper K." ,"winner": "Herbert P.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
But I'm currently getting individual lines for each loser item and winner item. I would like both winner and loser to be on the same line with the url.
{"loser": "De Schepper K.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
{"winner": "Herbert P.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
{"loser": "Sugita Y.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
I'm not sure if my selectors are causing this behaviour, but I'd like to know how I can customise the pipeline so that the loser, winner, and date all end up on the same JSON line.
I've never exported JSON format before, so it's new to me.
How do you specify which JSON keys and values go on each line using a custom pipeline?
I also tried to use the CSV item exporter for this and got strange behaviour too (ref: Scrapy output is showing empty rows per column).
Here's my spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import json
from scrapy.http import Request, FormRequest

class MySpider(scrapy.Spider):
    name = "jsscraper"
    start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 1.5})

    def parse(self, response):
        for row in response.css('.event-team'):
            il = SofascoreItemLoader(selector=row)
            il.add_css('winner', '.event-team:nth-child(2)::text')
            il.add_css('loser', '.event-team:nth-child(1)::text')
            il.add_value('url', response.url)
            yield il.load_item()
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from operator import methodcaller
from scrapy import Spider, Request, Selector

class SofascoreItem(scrapy.Item):
    loser = scrapy.Field()
    winner = scrapy.Field()
    url = scrapy.Field()

class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
pipeline.py
import json
import codecs
from collections import OrderedDict

class JsonPipeline(object):
    def __init__(self):
        self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
The problem here is that you're looping over .event-team elements.
One of these elements can only be the winner or the loser, so you get an item for each.
What you should be doing instead is loop over elements containing both (.list-event seems like a good candidate), and extract both the winner and loser from those.
This way, you'd have one loop per event, and as a result, one item per event.
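As a rough sketch, assuming .list-event really is the per-event container (the inner selectors are the ones from the question and may need adjusting):

def parse(self, response):
    # one iteration per event, so winner, loser and url end up in the same item
    for event in response.css('.list-event'):
        il = SofascoreItemLoader(selector=event)
        il.add_css('winner', '.event-team:nth-child(2)::text')
        il.add_css('loser', '.event-team:nth-child(1)::text')
        il.add_value('url', response.url)
        yield il.load_item()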
I looked at your question again, and now I see where the problem is:
for row in response.css('.event-team'):
With the above line you will get many selectors (a SelectorList). However, each selector, or row, can only give you one field, winner or loser; you can't get them both.
That's why there are empty rows in your output.
Solution: try the following line instead:
for row in response.css('div[class="cell__section--main s-tennisCell curb-width"]'):

Scrapy: How to populate an item with data from two websites

I'd like to collect an item's data from 2 different websites.
It should work as follows:
parse_website_1 fetches a person's name from website_1 and populates the item
parse_website_1 yields a request for parse_website_2
parse_website_2 parses website_2, collects the person's hair color based on the name scraped from website_1, and populates the item
parse_website_2 loads the item
Would this be going in the right direction, given that the item is defined in items.py:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem

class MySpider(scrapy.Spider):
    name = "myspider"

    def __init__(self):
        self.item = ItemLoader(item=MyItem(), response=response)

    def start_requests(self):
        scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        self.item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2)

    def parse_website_2(self, response):
        self.item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield self.item.load_item()
The idea is right, but the implementation is not correct, in that you are trying to pass data between consecutive requests using an instance attribute (self.item). Scrapy requests are asynchronous, so it would not work as expected.
The correct way to do it is outlined in the Scrapy FAQ: pass the partial item data to the consecutive request using the Request's meta attribute, retrieve it from the Response's meta attribute in the next callback, add the remaining data, and finally yield the item. Here's the adapted code:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem

class MySpider(scrapy.Spider):
    name = "myspider"

    def start_requests(self):
        yield scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        item = ItemLoader(item=MyItem(), response=response)
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2, meta={'item': item})

    def parse_website_2(self, response):
        item = response.meta['item']
        item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield item.load_item()
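On Scrapy 1.7 and later, the same hand-off can be written with cb_kwargs instead of meta, as the cv-library spider further up this page does; a minimal sketch:

def parse_website_1(self, response):
    item = ItemLoader(item=MyItem(), response=response)
    name = response.xpath('//div[@class="name"]/text()').extract_first()
    item.add_value("name", name)
    yield scrapy.Request(url="http://website_2.com/" + name,
                         callback=self.parse_website_2,
                         cb_kwargs={'item': item})

def parse_website_2(self, response, item):
    # the loader arrives as a keyword argument instead of via response.meta
    item.add_xpath("hair_color", '//div[@class="hair_color"]')
    yield item.load_item()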

scrapy extract the url of image

How do I fetch image URLs from a website using Scrapy in Python? Please help me. This is my code:
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class someSpider(CrawlSpider):
    name = 'crawltest'
    allowed_domains = ['bambeeq.com']
    start_urls = ['http://www.bambeeq.com/']
    rules = (Rule(LinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        item = MyItem()
        item['url'] = []
        for link in LinkExtractor(allow=(), deny=self.allowed_domains).extract_links(response):
            item['url'].append(link.url)
            #item['image'].append(link.img)
        return item
You are extracting the links ('a' element), not the images ('img' element). Try this:
# iterate over the list of images
for image in response.xpath('//img/@src').extract():
    # make each one into a full URL and add to item['url']
    item['url'].append(response.urljoin(image))
yield item
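Put together, parse_obj would look something like this (a sketch; the field keeps the question's name url even though it now holds image URLs):

def parse_obj(self, response):
    item = MyItem()
    item['url'] = []
    # collect every <img src> on the page as an absolute URL
    for image in response.xpath('//img/@src').extract():
        item['url'].append(response.urljoin(image))
    yield item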
