I'm having a problem iterating a crawl using Scrapy. I am extracting a title field and a content field. The problem is that I get a JSON file with all of the titles listed and then all of the content. I'd like to get {title}, {content}, {title}, {content}, which means I probably have to iterate inside the parse function. The problem is that I cannot figure out what element I am looping over (i.e., for x in [???]). Here is the code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import SitemapSpider
from Foo.items import FooItem

class FooSpider(SitemapSpider):
    name = "foo"
    sitemap_urls = ['http://www.foo.com/sitemap.xml']
    #sitemap_rules = [

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        item = FooItem()
        item['title'] = hxs.select('//span[@class="headline"]/text()').extract()
        item['content'] = hxs.select('//div[@class="articletext"]/text()').extract()
        items.append(item)
        return items
Your XPath queries return all titles and all contents on the page. I suppose you can do:
titles = hxs.select('//span[@class="headline"]/text()').extract()
contents = hxs.select('//div[@class="articletext"]/text()').extract()
for title, content in zip(titles, contents):
    item = FooItem()
    item['title'] = title
    item['content'] = content
    yield item
But it is not reliable. Try to write an XPath query that returns a block containing both the title and the content. If you showed me the XML source, I could be more specific.
blocks = hxs.select('//div[@class="some_filter"]')
for block in blocks:
    item = FooItem()
    item['title'] = block.select('span[@class="headline"]/text()').extract()
    item['content'] = block.select('div[@class="articletext"]/text()').extract()
    yield item
I'm not sure about the exact XPath queries, but I think the idea is clear.
You don't need HtmlXPathSelector. Scrapy already has a built-in XPath selector on the response. Try this:
blocks = response.xpath('//div[@class="some_filter"]')
for block in blocks:
    item = FooItem()
    item['title'] = block.xpath('span[@class="headline"]/text()').extract()[0]
    item['content'] = block.xpath('div[@class="articletext"]/text()').extract()[0]
    yield item
I am a newbie using XPath. I want to extract every title, body, link, and release date from this link.
Everything seems okay except the body: how do I extract each body with a nested XPath? Thanks in advance :)
Here is my source:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from thehack.items import ThehackItem

class MySpider(BaseSpider):
    name = "thehack"
    allowed_domains = ["thehackernews.com"]
    start_urls = ["http://thehackernews.com/search/label/mobile%20hacking"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//article[@class="post item module"]')
        items = []
        for titles in titles:
            item = ThehackItem()
            item['title'] = titles.select('span/h2/a/text()').extract()
            item['link'] = titles.select('span/h2/a/@href').extract()
            item['body'] = titles.select('span/div/div/div/div/a/div/text()').extract()
            item['date'] = titles.select('span/div/span/text()').extract()
            items.append(item)
        return items
Can anybody fix the body block? The problem is only with the body... Thanks in advance, mate.
Here is the picture of the inspected elements from the website:
I think you were struggling with the selectors, right? You should check the documentation for selectors; there's a lot of good information there. In this particular example, using CSS selectors, it would be something like:
class MySpider(scrapy.Spider):
    name = "thehack"
    allowed_domains = ["thehackernews.com"]
    start_urls = ["http://thehackernews.com/search/label/mobile%20hacking"]

    def parse(self, response):
        for article in response.css('article.post'):
            item = ThehackItem()
            item['title'] = article.css('.post-title>a::text').extract_first()
            item['link'] = article.css('.post-title>a::attr(href)').extract_first()
            item['body'] = ''.join(article.css('[id^=summary] *::text').extract()).strip()
            item['date'] = article.css('[itemprop="datePublished"]::attr(content)').extract_first()
            yield item
It would be a good exercise for you to change them to XPath selectors, and maybe also to read about ItemLoaders; together they are very useful. A rough sketch of the loader approach is below.
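For reference, here is a minimal sketch of the same parse method using an ItemLoader. This assumes Scrapy 1.0+, where scrapy.loader is available, and reuses the CSS selectors from above without re-verifying them against the page:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst

def parse(self, response):
    for article in response.css('article.post'):
        # Bind the loader to the article so the CSS queries are relative to it.
        loader = ItemLoader(item=ThehackItem(), selector=article)
        loader.default_output_processor = TakeFirst()  # behaves like extract_first()
        loader.add_css('title', '.post-title>a::text')
        loader.add_css('link', '.post-title>a::attr(href)')
        loader.add_css('body', '[id^=summary] *::text', Join(''))
        loader.add_css('date', '[itemprop="datePublished"]::attr(content)')
        yield loader.load_item()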
I've built my first Scrapy project but can't figure out the last hurdle.
With my script below I get one long list in the CSV: first all the product prices and then all the product names.
What I would like to achieve is that for every product the price is next to it.
For example:
Product Name, Product Price
Product Name, Product Price
My Scrapy project:
Items.py
from scrapy.item import Item, Field

class PrijsvergelijkingItem(Item):
    Product_ref = Field()
    Product_price = Field()
My spider, called nvdb.py:
from scrapy.spider import BaseSpider
import scrapy.selector
from Prijsvergelijking.items import PrijsvergelijkingItem

class MySpider(BaseSpider):
    name = "nvdb"
    allowed_domains = ["vandenborre.be"]
    start_urls = ["http://www.vandenborre.be/tv-lcd-led/lcd-led-tv-80-cm-alle-producten"]

    def parse(self, response):
        hxs = scrapy.Selector(response)
        titles = hxs.xpath("//ul[@id='prodlist_ul']")
        items = []
        for titles in titles:
            item = PrijsvergelijkingItem()
            item["Product_ref"] = titles.xpath("//div[@class='prod_naam']//text()[2]").extract()
            item["Product_price"] = titles.xpath("//div[@class='prijs']//text()[2]").extract()
            items.append(item)
        return items
You need to switch your XPath expressions to work in the context of every "product". In order to do this, you need to prepend a dot to the expressions:
def parse(self, response):
    products = response.xpath("//ul[@id='prodlist_ul']/li")
    for product in products:
        item = PrijsvergelijkingItem()
        item["Product_ref"] = product.xpath(".//div[@class='prod_naam']//text()[2]").extract_first()
        item["Product_price"] = product.xpath(".//div[@class='prijs']//text()[2]").extract_first()
        yield item
I've also improved the code a little bit:
- assumed you meant to iterate over the list items (ul -> li) and not just the ul, and fixed the expression accordingly
- used the response.xpath() shortcut method
- used extract_first() instead of extract()
- improved the variable naming
- used yield instead of collecting items in a list and then returning
I am not sure if this helps, but you can use OrderedDict from collections for your needs.
from scrapy.spider import BaseSpider
import scrapy.selector
from collections import OrderedDict
from Prijsvergelijking.items import PrijsvergelijkingItem

class MySpider(BaseSpider):
    name = "nvdb"
    allowed_domains = ["vandenborre.be"]
    start_urls = ["http://www.vandenborre.be/tv-lcd-led/lcd-led-tv-80-cm-alle-producten"]

    def parse(self, response):
        hxs = scrapy.Selector(response)
        titles = hxs.xpath("//ul[@id='prodlist_ul']")
        items = []
        for titles in titles:
            item = OrderedDict(PrijsvergelijkingItem())
            item["Product_ref"] = titles.xpath("//div[@class='prod_naam']//text()[2]").extract()
            item["Product_price"] = titles.xpath("//div[@class='prijs']//text()[2]").extract()
            items.append(item)
        return items
Also, you might have to change the way you iterate over the dicts:
for od in items:
    for key, value in od.items():
        print key, value
Very new to Scrapy, so bear with me.
First, here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from usdirectory.items import UsdirectoryItem
from scrapy.http import Request

class MySpider(BaseSpider):
    name = "usdirectory"
    allowed_domains = ["domain.com"]
    start_urls = ["url_removed_sorry"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//*[@id="holder_result2"]/a[1]/span/span[1]/text()').extract()
        for title in titles:
            item = UsdirectoryItem()
            item["title"] = title
            item
            yield item
That works...but it only grabs the first item.
I noticed that in the items I am trying to scrape, the XPath changes for each row. For example, the first row is the XPath you see above:
//*[@id="holder_result2"]/a[1]/span/span[1]/text()
Then it increments by 2, all the way to 29. So the second result:
//*[@id="holder_result2"]/a[3]/span/span[1]/text()
Last result:
//*[@id="holder_result2"]/a[29]/span/span[1]/text()
So my question is: how do I get the script to grab all of those? I don't care if I have to copy and paste code for every item. All the other pages are exactly the same. I'm just not sure how to go about it.
Thank you very much.
Edit:
import scrapy
from scrapy.item import Item, Field

class UsdirectoryItem(scrapy.Item):
    title = scrapy.Field()
Given that the pattern is exactly as you described, you can use the XPath modulo operator mod on the position index of a to get all the target a elements:
//*[@id="holder_result2"]/a[position() mod 2 = 1]/span/span[1]/text()
For a quick demo, consider the following input XML:
<div>
    <a>1</a>
    <a>2</a>
    <a>3</a>
    <a>4</a>
    <a>5</a>
</div>
Given the XPath /div/a[position() mod 2 = 1], the following elements will be returned:
<a>1</a>
<a>3</a>
<a>5</a>
See the live demo on xpathtester.com here.
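As a rough sketch, that single expression could replace a per-index loop inside the spider's parse method. This reuses the classes from the question and is untested, since the URL was removed:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # One query matches a[1], a[3], ..., a[29] in a single pass.
    xpath = '//*[@id="holder_result2"]/a[position() mod 2 = 1]/span/span[1]/text()'
    for title in hxs.select(xpath).extract():
        item = UsdirectoryItem()
        item["title"] = title
        yield item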
Let me know if this works for you. Notice we are iterating over a[i] instead of a[1], and each item is yielded as it is built.
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    for i in xrange(15):
        titles = hxs.select('//*[@id="holder_result2"]/a[' + str(1 + i * 2) + ']/span/span[1]/text()').extract()
        for title in titles:
            item = UsdirectoryItem()
            item["title"] = title
            yield item
I am using Scrapy 1.0.3. Here is the code of my spider file:
from scrapy import Spider
from scrapy.selector import Selector
from parser_xxx.items import XxxItem

class XxxSpider(Spider):
    name = "xxx"
    allowed_domains = ["xxx.xxx.com"]
    start_urls = ["http://xxx.xxx.com/jobs/"]

    def parse(self, response):
        quelist = Selector(response).xpath('//div[@id="job_listings"]')
        for que in quelist:
            item = XxxItem()
            item['title'] = que.xpath('//a//h4/text()').extract()
            item['link'] = que.xpath('//a/@href').extract()
            yield item
But I am getting all anchor links and all titles. Where am I wrong?
Thanks in advance!
You have to make your XPath expressions context-specific by prepending a dot. Plus, I think you should iterate over the links inside the div with id="job_listings":
quelist = response.xpath('//div[@id="job_listings"]//a')
for que in quelist:
    item = XxxItem()
    item['title'] = que.xpath('.//h4/text()').extract()
    item['link'] = que.xpath('@href').extract()
    yield item
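Since extract() returns a list, each field above ends up as a one-element list. If you prefer plain strings, a small variation like this should work on Scrapy 1.0.3 (extract_first() was added in 1.0):

for que in response.xpath('//div[@id="job_listings"]//a'):
    item = XxxItem()
    # extract_first() returns the first match as a string, or None if nothing matches.
    item['title'] = que.xpath('.//h4/text()').extract_first()
    item['link'] = que.xpath('@href').extract_first()
    yield item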
I am trying to scrape TripAdvisor's reviews, but I cannot find the XPath to have it dynamically go through all the pages. I tried yield and callback, but the thing is I cannot find the XPath for the link that goes to the next page. I am talking about this site.
Here is my code (UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem

class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
    output_json_dict = {}

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        items = []
        i = 0
        for sites in sites:
            item = ScrapingTestingItem()
            #item['reviews'] = sel.xpath('//p[@class="partial_entry"]/text()').extract()
            item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
            item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
            item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
            items.append(item)
            i += 1
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        if(sites and len(sites) > 0):
            yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
        else:
            yield items
If you want to select the URL behind Next, why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
And then yield a Request with this URL? This way you always get the next page to scrape and do not need the line containing the numbers.
Recently I did something similar on TripAdvisor and this approach worked for me. If this does not work for you, update your code with the approach you are trying so we can see where it can be improved.
Update
And change your Request creation block to the following:
if sites and len(sites) > 0:
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield the items at the end of the loop, once the method has finished parsing the current page. A rough sketch of the combined method is below.
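Putting the two suggestions together, the reworked parse method might look something like this (the field selectors are copied from the question and not re-verified against TripAdvisor):

def parse(self, response):
    sel = Selector(response)
    # Emit the item scraped from the current page.
    item = ScrapingTestingItem()
    item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
    item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
    item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
    yield item
    # Then follow every "Next" link and parse it with this same method.
    for site in sel.xpath('//a[contains(text(), "Next")]/@href').extract():
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)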
I think it can only work if you make a list of the URLs you want to scrape in a .txt file.
class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"

    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()