I'm trying to figure out how to use regex with scrapy item loaders.
I've tried to use a lambda function with split() and got the following error.
Split cannot be defined. You can see the function is commented out in the item loader class.
What I'm trying to do is remove all the text before the date including the "/"
of the date item. Date item being the url that I've just parsed
"https://www.sofascore.com/tennis/2018-02-07"
How do I use regex with scrapy item loaders?
Can I pass in the regex to the item loader or do I have to process it at the spider?
spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import json
from scrapy.http import Request, FormRequest
class MySpider(scrapy.Spider):
name = "jsscraper"
start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url,
callback=self.parse,
endpoint='render.html',
args={'wait': 1.5})
def parse(self, response):
for row in response.css('.event-team'):
il = SofascoreItemLoader(selector=row)
il.add_css('winner' , '.event-team:nth-child(2)::text')
il.add_css('loser' , '.event-team:nth-child(1)::text')
il.add_value('date', response.url)
yield il.load_item()
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Split
from operator import methodcaller
from scrapy import Spider, Request, Selector
class SofascoreItem(scrapy.Item):
loser = scrapy.Field()
winner = scrapy.Field()
date = scrapy.Field()
class SofascoreItemLoader(ItemLoader):
default_item_class = SofascoreItem
default_input_processor = MapCompose(methodcaller('strip'))
default_output_processor = TakeFirst()
#review_in = MapCompose(lambda x: x.split("/" , [-1]))
il.add_value('date', response.url, re='([^/]+)$')
See https://doc.scrapy.org/en/latest/topics/loaders.html for more details
Here is what is wrong with the code.
Apparently you do not have to 'feed' item loader with add_value but you won't get field populated in the end.
class SofascoreItemLoader(ItemLoader):
default_item_class = SofascoreItem
default_input_processor = MapCompose(methodcaller('strip'))
default_output_processor = TakeFirst()
review_in = MapCompose(lambda x: x.split("/")[-1])
You have to do split and then select last item in list that is generated by splitting. split(SEPARATOR, [-1]) is not what you want. Second argument is for selecting in how many parts you want string to be split.
Second, you want to add url value to review field, right?
This is not an answer how to use regex in scrapy ItemLoader, but you do not need it here. You just need to properly use split method.
I am trying to scrape yahoo stocks for a school project, but I have no idea how to go through each link of a page with a very certain link. The goal is to iterate through each stock with a certain ending portion of the url like so:
Starting URL = ["https://ca.finance.yahoo.com/q/hp?s=BMO.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"]
The next URL would be something like:
#Canadian Imperial(note the "CM"):
"https://ca.finance.yahoo.com/q/hp?s=CM.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
#Blackberry (note the "BB"):
"https://ca.finance.yahoo.com/q/hp?s=BB.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
etc...
In other words, the only thing that would change would be the characters between "hp?s=" and ".TO&a".
Wondering if this is possible or not. The ending portion of the URL must stay the same as that is the page I need to get to. Unfortuantely, there is no links within each page on yahoo to go to other stocks.
If I could do this with Scrapy's Rules and SmglLinkExtractor, that would be preferable.
Would appreciate any help!
Thanks!
Current Scrapy code:
from scrapy.spider import Spider
from scrapy.selector import Selector
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
class DmozSpider(Spider):
name = "dmoz"
allowed_domains = ["ca.finance.yahoo.com"]
start_urls = [
"https://ca.finance.yahoo.com/q/hp?s=BMO.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
]
rules = [
Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"), follow=True)
]
def parse(self, response):
item = Website()
item['name'] = response.xpath('//div[#class="title"]/h2/text()').extract()
print item['name']
Make a rule to follow the links matching the pattern:
rules = [
Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.\w+&a=\d+&b=\d+&c=\d+&d=\d+&e=\d+&f=\d+&g=m"), follow=True)
]
Though, I am not sure that you need to check for all URL parameters here. Simplified version:
rules = [
Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.\w+"), follow=True)
]
And, don't forget the imports:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
Here's an example of what I was talking about in the comments i left.
import urllib
import os
company_symbol = ["ACGL", "AFSI", "AGII", "AGNC", "ANAT", "ARCP", "ASBC", "ASPS", "BANF", "BBCN", "BGCP", "BNCL", "BOKF", "BPOP", "BRKL", "CACC", "CATY", "CBOE", "CBSH", "CFFN", "CHFC", "CINF", "CME ", "COLB", "CVBF", "ERIE", "ESGR", "ETFC", "EWBC", "EZPW", "FCFS", "FCNC", "FFBC", "FFIN", "FITB", "FMBI", "FMER", "FNFG", "FNGN", "FSRV", "FULT", "GBCI", "GLPI", "GLRE", "HBAN", "HBHC", "HLSS", "HOMB", "IBKC", "IBKR", "IBOC", "IPCC", "ISBC", "KRNY", "LPLA", "MBFI", "MHLD", "MKTX", "MTGE", "NAVG", "NBTB", "NDAQ", "NFBK", "NPBC", "NTRS", "NWBI", "ORIT", "OZRK", "PACW", "PBCT", "PCH ", "PNFP", "PRAA", "PVTB", "ROIC", "SAFT", "SBNY", "SBRA", "SCBT", "SEIC", "SIGI", "SIVB", "SLM ", "STFC", "SUSQ", "TCBI", "TFSL", "TRMK", "TROW", "UBSI", "UMBF", "UMPQ", "VRTS", "WABC", "WAFD", "WETF", "WRLD", "WTFC", "Z", "ZION"]
for company in company_symbol:
url = 'http://finance.google.com/finance/info?client=ig&q={0}:{1}'.format(company, 'NASDAQ')
nasdaq = urllib.urlopen(url)
text = nasdaq.read()
filename = 'nasdaq.txt'.format(company)
with file(filename, 'a') as output:
output.write(str(text))
This code will was written as an example of one way to change urls and do something with each url.
If you need to scrape only predefined quotes for given period, then the logic is following:
Prepare the list of quotes you interested in e.g. ['ABC', 'XYZ', 'LOL', ...].
Use basic scrapy.Spider.
Define start_requests() method and yield a sequence of requests from it.
Sample implementation:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class QuotesSpider(scrapy.Spider):
name = "quotes"
allowed_domains = ["ca.finance.yahoo.com"]
quotes = ["BMO", "CM", "BB"]
url_template = "https://ca.finance.yahoo.com/q/hp?s=%s.TO\
&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
def start_requests(self):
for quote in self.quotes:
url = self.url_template % quote
yield Request(url)
def parse(self, response):
# process
But if you need to get ALL TSX quotes data, then I would recommend you to scrape them from available listings and then use as in above example. Crawling the entire ca.finance.yahoo.com is obviously a bad idea.
If you have a list of stocks you want to load the yahoo page for, you can get a list of the yahoo urls like this:
url_template = "https://ca.finance.yahoo.com/q/hp?s={}.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
stocks = ['CM', 'BB']
urls = [url_template.format(stock) for stock in stocks]
I haven't used scrapy, though, so I'm not sure if this is what you need.
im new with Scrapy and web crawling and I've been working on the page www.mercadolibre.com.mx I have to get (from the startpage) some data (descripton and prices) about the produtcs displayed in there. Here is my items.py:
from scrapy.item import Item, Field
class PruebaMercadolibreItem(Item):
producto = Field()
precio = Field()
And here is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from prueba_mercadolibre.items import PruebaMercadolibreItem
class MLSpider(BaseSpider):
name = "mlspider"
allowed_domains = ["mercadolibre.com"]
start_urls = ["http://www.mercadolibre.com.mx"]
def parse (self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//div[#class='item-data']")
items = []
for titles in titles:
item = PruebaMercadolibreItem()
item["producto"] = titles.select("p[#class='tit le']/#title").extract()
item["precio"] = titles.select("span[#class='ch-price']/text()").extract()
items.append(item)
return items
The problem is that I get the same results in when I change this line:
titles = hxs.select("//div[#class='item-data']")
To this:
titles = hxs.select("//div[#class='item-data'] | //div[#class='item-data item-data-mp']")
And Im not getting the same data as when I use the first line.
Can anyone help me? do I have any errorin my xPath selection?
Also I cant find a good tutorial for using MySQL with scrapy, I would appreciate any help. Thx
Better use contains if you want to get all div tags containing item-data class:
titles = hxs.select("//div[contains(#class, 'item-data')]")
Also, you have other problems in the spider:
the loop, you are overriding the titles
class name in producto xpath should be title, not tit le
you probably don't want to have lists in Field values, get the first items out of the extracted lists
HtmlXPathSelector is deprecated, use Selector instead
select() is deprecated, use xpath() instead
BaseSpider has been renamed to Spider
Here's the code with modifications:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field
from prueba_mercadolibre.items import PruebaMercadolibreItem
class MLSpider(Spider):
name = "mlspider"
allowed_domains = ["mercadolibre.com"]
start_urls = ["http://www.mercadolibre.com.mx"]
def parse (self, response):
hxs = Selector(response)
titles = hxs.xpath("//div[contains(#class, 'item-data')]")
for title in titles:
item = PruebaMercadolibreItem()
item["producto"] = title.xpath("p[#class='title']/#title").extract()[0]
item["precio"] = title.xpath("span[#class='ch-price']/text()").extract()[0]
yield item
Example items from the output:
{'precio': u'$ 35,000', 'producto': u'Cuatrimoto, Utv De 500cc 4x4 ,moto , Motos, Atv ,'}
{'precio': u'$ 695', 'producto': u'Reloj Esp\xeda Camara Oculta Video Hd 16 Gb! Sony Compara.'}
I want to extract the contents like side-effects, warning, dosage from the site mentioned in the start urls. The following is my code. The csv file is getting created but nothing is displayed. The output is:
before for
[] # it is displaying empty list
after for
This is my code:
from scrapy.selector import Selector
from medicinelist_sample.items import MedicinelistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class MedSpider(CrawlSpider):
name = "med"
allowed_domains = ["medindia.net"]
start_urls = ["http://www.medindia.net/doctors/drug_information/home.asp?alpha=z"]
rules = [Rule(SgmlLinkExtractor(allow=('Zafirlukast.htm',)), callback="parse", follow = True),]
global Selector
def parse(self, response):
hxs = Selector(response)
fullDesc = hxs.xpath('//div[#class="report-content"]//b/text()')
final = fullDesc.extract()
print "before for" # this is just to see if it was printing
print final
print "after for" # this is just to see if it was printing
Your scrapy spider class's parse method should return item(s). With the current code, I do not see any item being returned. An example would be,
def parse_item(self, response):
self.log('Hi, this is an item page! %s' % response.url)
sel = Selector(response)
item = Item()
item['id'] = sel.xpath('//td[#id="item_id"]/text()').re(r'ID: (\d+)')
item['name'] = sel.xpath('//td[#id="item_name"]/text()').extract()
item['description'] = sel.xpath('//td[#id="item_description"]/text()').extract()
return item
For more information, take a look at the CrawlSpider example in the official scrapy docs.
Another problem in your code is that you are overriding the CrawlSpider's parse method to implement callback logic. This mustn't be done with CrawlSpiders since the parse method is used in its logic.
Ashish Nitin Patil has implicitly noted that already by naming his example function *parse_item*.
What the default implementation of a Crawl Spider's parse method basically does is to call the callbacks, that you've specified in the rule definitions; so if you override it, I think your callbacks won't be called at all. See Scrapy Doc - crawling rules
I just have experimented a bit with the site that you are crawling. As you would like to extract some data about the medicine (like the name, indications, contraindications, etc.) out the different sites on this domain: Wouldn't the following or similar XPath expressions fit your needs? I think your current query would give you just the "headers", but the actual info on this site is in the textnodes that follow those bold-rendered headers.
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from Test.items import TestItem
from scrapy.item import Item, Field
class Medicine(Item):
name = Field()
dosage = Field()
indications = Field()
contraindications = Field()
warnings = Field()
class TestmedSpider(CrawlSpider):
name = 'testmed'
allowed_domains = ['http://www.medindia.net/doctors/drug_information/']
start_urls = ['http://www.http://www.medindia.net/doctors/drug_information/']
rules = (
Rule(SgmlLinkExtractor(allow=r'Zafirlukast.htm'), callback='parse_item', follow=True),
)
def parse_item(self, response):
drug_info = Medicine()
selector = Selector(response)
name = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Generic Name')]//..//following-sibling::text()[1])''')
dosage = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Dosage')]//..//following-sibling::text()[1])''')
indications = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Why it is prescribed (Indications)')]//..//following-sibling::text()[1])''')
contraindications = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Contraindications')]//..//following-sibling::text()[1])''')
warnings = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Warnings and Precautions')]//..//following-sibling::text()[1])''')
drug_info['name'] = name.extract()
drug_info['dosage'] = dosage.extract()
drug_info['indications'] = indications.extract()
drug_info['contraindications'] = contraindications.extract()
drug_info['warnings'] = warnings.extract()
return drug_info
This would give you the following infos:
>scrapy parse --spider=testmed --verbose -d 2 -c parse_item --logfile C:\Python27\Scripts\Test\Test\spiders\test.log http://www.medindia.net/doctors/drug_information/Zafirlukast.htm
>>> DEPTH LEVEL: 1 <<<
# Scraped Items ------------------------------------------------------------
[{'contraindications': [u'Hypersensitivity.'],
'dosage': [u'Adult- The recommended dose is 20 mg twice daily.'],
'indications': [u'This medication is an oral leukotriene receptor antagonist (
LTRA), prescribed for asthma. \xa0It blocks the action of certain natural substa
nces that cause swelling and tightening of the airways.'],
'name': [u'\xa0Zafirlukast'],
'warnings': [u'Caution should be exercised in patients with history of liver d
isease, mental problems, suicidal thoughts, any allergy, elderly, during pregnan
cy and breastfeeding.']}]
I'm currently trying to run the following code but it keeps scraping only the first result of each page. Any idea what the issue may be?
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123Item
import urlparse
from scrapy.http.request import Request
class MySpider(CrawlSpider):
name = "xyz123"
allowed_domains = ["www.xyz123.com.au"]
start_urls = ["http://www.xyz123.com.au/",]
rules = (Rule (SgmlLinkExtractor(allow=("",),restrict_xpaths=('//*[#id="1234headerPagination_hlNextLink"]',))
, callback="parse_xyz", follow=True),
)
def parse_xyz(self, response):
hxs = HtmlXPathSelector(response)
xyz = hxs.select('//div[#id="1234SearchResults"]//div/h2')
items = []
for xyz in xyz:
item = xyz123Item()
item ["title"] = xyz.select('a/text()').extract()[0]
item ["link"] = xyz.select('a/#href').extract()[0]
items.append(item)
return items
The Basespider version works well scraping ALL the required data on the first page:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from firstproject.items import xyz123
class MySpider(BaseSpider):
name = "xyz123test"
allowed_domains = ["xyz123.com.au"]
start_urls = ["http://www.xyz123.com.au/"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select('//div[#id="1234SearchResults"]//div/h2')
items = []
for titles in titles:
item = xyz123Item()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
items.append(item)
return items
Sorry for the censoring. I had to censor the website for privacy reasons.
The first code crawls through the pages well the way I'd like it to crawl, however it only pulls the first item title and link. NOTE: The XPath of the first title using "inspect element" in google is:
//*[#id="xyz123SearchResults"]/div[1]/h2/a,
second is //*[#id="xyz123SearchResults"]/div[2]/h2/a
third is //*[#id="xyz123SearchResults"]/div[3]/h2/a etc.
I'm not sure if the div[n] bit is what's killing it. I'm hoping it's an easy fix.
Thanks
for xyz in xyz:
item = xyz123Item()
item ["title"] = xyz.select('a/text()').extract()[0]
item ["link"] = xyz.select('a/#href').extract()[0]
items.append(item)
return items
Are you sure about the indentation of the return items ? It should be one less.