Is this Sprite SVG possible to scrape? - python

Can I scrape this with standard Scrapy or do I need to use Selenium?
The html is:
<td class="example"><sprite-svg name="EXAMPLE2"><svg><use
xlink:href="/spritemap/1_0_30#sprite-EXAMPLE2"></use></svg></sprite-svg></td>
I need the value "EXAMPLE2" somehow.
The xpath which works in the browser is //td[#class='example']//*[local-name() = 'svg']
When I put it into scrapy I use the following code but am getting XPATH error.
'example' : div.xpath(".//td[#class='example']//*[local-name() = 'svg']
()").extract()
Any ideas how to scrape it?

Looking at the table, each svg sprite is under a class 'rug_X'
Something like
import scrapy
class RaceSpider(scrapy.Spider):
name = 'race'
allowed_domains = ['thedogs.com.au']
start_urls = ['https://www.thedogs.com.au/racing/gawler/2020-07-07/1/the-bunyip-maiden-stake-pr2-division1']
item = {}
def parse(self, response):
row = response.xpath('//tbody/tr')
dog = a.xpath('.//td[#class="table__cell--tight race-runners__name"]/div/a/text()').get()
number = a.xpath('.//td[#class="table__cell--tight race-runners__box"]/sprite-svg/#name').get()
cleaned_num = int(number.replace('rug_',''))
grade = a.xpath('.//td[#class="race-runners__grade"]/text()').get()
item = {'grade':grade, 'greyhound':dog,'rug':cleaned_num}
yield item
You could also use item loaders with a custom function to clean up the response you get.

Yes. You can do it with scrapy :
response.xpath("//td[#class='table__cell--tight race-runners__box']/sprite-svg/#name").getall()
Working scrapy code :
import scrapy
class Test(scrapy.Spider):
name = 'Test'
start_urls = [
'https://www.thedogs.com.au/racing/gawler/2020-07-07/1/the-bunyip-maiden-stake-pr2-division1']
def parse(self, response):
return {"nameList": response.xpath("//td[#class='table__cell--tight race-runners__box']/sprite-svg/#name").getall()}

Related

IMDB Movie Scraping gives blank csv using scrapy

I am getting Blank csv, though its not showing any error in code.
It is unable to crawl through web page.
This is the code which I have written referring youtube:-
import scrapy
from Example.items import MovieItem
class ThirdSpider(scrapy.Spider):
name = "imdbtestspider"
allowed_domains = ["imdb.com"]
start_url = ('http://www.imdb.com/chart/top',)
def parse(self,response):
links = response.xpath('//tbody[#class="lister-list"]/tr/td[#class="titleColumn"]/a/#href').extract()
i = 1
for link in links:
abs_url = response.urljoin(link)
#
url_next = '//*[#id="main"]/div/span/div/div/div[2]/table/tbody/tr['+str(i)+']/td[3]/strong/text()'
rating = response.xpath(url_next).extact()
if (i <= len(link)):
i=i+1
yield scrapy.Request(abs_url, callback = self.parse_indetail, meta = {'rating': rating})
def parse_indetail(self,response):
item = MovieItem()
#
item['title'] = response.xpath('//div[#class="title_wrapper"])/h1/text()').extract[0][:-1]
item['directors'] = response.xpath('//div[#class="credit_summary_items"]/span[#itemprop="director"]/a/span/text()').extract()[0]
item['writers'] = response.xpath('//div[#class="credit_summary_items"]/span[#itemprop="creator"]/a/span/text()').extract()
item['stars'] = response.xpath('//div[#class="credit_summary_items"]/span[#itemprop="actors"]/a/span/text()').extract()
item['popularity'] = response.xpath('//div[#class="titleReviewBarSubItem"]/div/span/text()').extract()[2][21:-8]
return item
This is output I am getting while running executing code with
scrapy crawl imdbtestspider -o example.csv -t csv
2019-01-17 18:44:34 [scrapy.core.engine] INFO: Spider opened
2019-01-17 18:44:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages
(at 0 pag es/min), scraped 0 items (at 0 items/min)
This is another way you might give a try with. I used css selector instead of xpath to make the script less verbose.
import scrapy
class ImbdsdpyderSpider(scrapy.Spider):
name = 'imbdspider'
start_urls = ['http://www.imdb.com/chart/top']
def parse(self, response):
for link in response.css(".titleColumn a[href^='/title/']::attr(href)").extract():
yield scrapy.Request(response.urljoin(link),callback=self.get_info)
def get_info(self, response):
item = {}
title = response.css(".title_wrapper h1::text").extract_first()
item['title'] = ' '.join(title.split()) if title else None
item['directors'] = response.css(".credit_summary_item h4:contains('Director') ~ a::text").extract()
item['writers'] = response.css(".credit_summary_item h4:contains('Writer') ~ a::text").extract()
item['stars'] = response.css(".credit_summary_item h4:contains('Stars') ~ a::text").extract()
popularity = response.css(".titleReviewBarSubItem:contains('Popularity') .subText::text").extract_first()
item['popularity'] = ' '.join(popularity.split()).strip("(") if popularity else None
item['rating'] = response.css(".ratingValue span::text").extract_first()
yield item
I have tested you given xpaths i don't know they are mistakenly wrong or are actually wrong.
e.g;
xpath = //*="main"]/div/span/div/div/div[2]/table/tbody/tr['+str(i)+']/td[3]/strong/text()
#There is not table when you reach at div[2]
//div[#class="title_wrapper"])/h1/text() #here there is and error after `]` ) is bad syntax
Plus your xpaths are not yielding any results.
As to why you are getting the error that says 0/pages crawled, despite not recreating your case, I have to assume that your method of page iteration is not building the page URLs correctly.
I'm having trouble understanding the use for creating the variable array of all the "follow links" and then using len to send them to the parse_indetail() but a couple things to note.
When your using "meta" to pass items from one function to the next, though you have the right idea, you are missing some the instantiation to the function your passing it to (you should also be using a standard naming convention for simplicity)
Should be something like this...
def parse(self,response):
# If you are going to capture an item at the first request, you must instantiate
# your items class
item = MovieItem()
....
# You seem to want to pass ratings to the next function for itimization, so
# you make sure that you have it listed in your items.py file and you set it
item[rating] = response.xpath(PATH).extact() # Why did you ad the url_next? huh?
....
# Standard convention for passing meta using call back is like this, this way
# allows you to pass multiple itemized item gets passed
yield scrapy.Request(abs_url, callback = self.parse_indetail, meta = {'item': item})
def parse_indetail(self,response):
# Then you must initialize the meta again in the function your passing it to
item = response.meta['item']
# Then you can continue your scraping
You should not complicate the page iteration logic. You seem to get how it works but need help fine tuning this aspect. I have recreated you use case and optimized it.
#items.py file
import scrapy
class TestimbdItem(scrapy.Item):
title = scrapy.Field()
directors = scrapy.Field()
writers = scrapy.Field()
stars = scrapy.Field()
popularity = scrapy.Field()
rating = scrapy.Field()
# The spider file
import scrapy
from testimbd.items import TestimbdItem
class ImbdsdpyderSpider(scrapy.Spider):
name = 'imbdsdpyder'
allowed_domains = ['imdb.com']
start_urls = ['http://www.imdb.com/chart/top']
def parse(self, response):
for href in response.css("td.titleColumn a::attr(href)").extract():
yield scrapy.Request(response.urljoin(href),
callback=self.parse_movie)
def parse_movie(self, response):
item = TestimbdItem()
item['title'] = [ x.replace('\xa0', '') for x in response.css(".title_wrapper h1::text").extract()][0]
item['directors'] = response.xpath('//div[#class="credit_summary_item"]/h4[contains(., "Director")]/following-sibling::a/text()').extract()
item['writers'] = response.xpath('//div[#class="credit_summary_item"]/h4[contains(., "Writers")]/following-sibling::a/text()').extract()
item['stars'] = response.xpath('//div[#class="credit_summary_item"]/h4[contains(., "Stars")]/following-sibling::a/text()').extract()
item['popularity'] = response.css(".titleReviewBarSubItem span.subText::text")[2].re('([0-9]+)')
item['rating'] = response.css(".ratingValue span::text").extract_first()
yield item
Notice two things:
Id the parse() function. All I'm doing here is using a for loop through the links, each instance in loop referred to href, and pass the urljoined href to the parser function. Give your use case, this is more than enough. In a situation where you have the next page, it's just creating a variable for the "next" page somehow and callback to parse, it will keep doing that till it cant fint a "next" page.
Secondly, Use xpath only when in the HTML items have the same tagwith different content. This is more of a personal opinion but I tell people that xpath selectors is like scalpel and css selectors is like a butcher knife. You can get damn accurate with scalpel but it takes more time and in many cases may be just easier to go with CSS selector to get the same result.

extract data from nested xpath

I am newbie using xpath,
I wanna extract every single title, body, link , release date from this link
everthing seems okay, but no on body, how to extract every single body on nested xPath, thanks before :)
here my source
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from thehack.items import ThehackItem
class MySpider(BaseSpider):
name = "thehack"
allowed_domains = ["thehackernews.com"]
start_urls = ["http://thehackernews.com/search/label/mobile%20hacking"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.xpath('//article[#class="post item module"]')
items = []
for titles in titles:
item = ThehackItem()
item['title'] = titles.select('span/h2/a/text()').extract()
item['link'] = titles.select('span/h2/a/#href').extract()
item['body'] = titles.select('span/div/div/div/div/a/div/text()').extract()
item['date'] = titles.select('span/div/span/text()').extract()
items.append(item)
return items
anybody can fix about body blok? only on body...
thanks before mate
here the picture of inspection elements from the website
I think you where struggling with the selectors, right? I think you should check the documentation for selectors, there's a lot of good information there. In this particular example, using the css selectors, I think it would be something like:
class MySpider(scrapy.Spider):
name = "thehack"
allowed_domains = ["thehackernews.com"]
start_urls = ["http://thehackernews.com/search/label/mobile%20hacking"]
def parse(self, response):
for article in response.css('article.post'):
item = ThehackItem()
item['title'] = article.css('.post-title>a::text').extract_first()
item['link'] = article.css('.post-title>a::attr(href)').extract_first()
item['body'] = ''. join(article.css('[id^=summary] *::text').extract()).strip()
item['date'] = article.css('[itemprop="datePublished"]::attr(content)').extract_first()
yield item
It would be a good exercise for you to change them to xpath selectors and maybe also check about ItemLoaders, together are very useful.

Scrapy only returning html, without content

I am trying to get the response...
<span id="BusinessDbaName" class="dataItem">TECHNO COATINGS INC</span>
scrapy is instead returning...
******name = [u'<span id="BusinessDbaName" class="dataItem"></span>']
i.e. I am having the html returned but not the content within the tags.
Question: What would cause this, and how do I fix it?
Here is my source code:
import scrapy
class lniSpider(scrapy.Spider):
name = "lni"
allowed_domains = ["secure.lni.wa.gov"]
start_urls = [
"https://secure.lni.wa.gov/verify/Detail.aspx?UBI=602123234&SAW="
]
def parse(self, response):
for sel in response.xpath('//body'):
name = sel.xpath('//*[#id="BusinessDbaName"]').extract()
print ("******name = "), name

ScraPy spider crawling but not exporting

I have a ScraPy Code that is running in shell, but when I try to export it to csv, it returns an empty file. It exports data when I do not go into a link and try to parse the description, but once I add the extra method of parsing the contents, it fails to work. Here is the code:
class MonsterSpider(CrawlSpider):
name = "monster"
allowed_domains = ["jobs.monster.com"]
base_url = "http://jobs.monster.com/v-technology.aspx?"
start_urls = [
"http://jobs.monster.com/v-technology.aspx"
]
for i in range(1,5):
start_urls.append(base_url + "page=" + str(i))
rules = (Rule(SgmlLinkExtractor(allow=("jobs.monster.com",))
, callback = 'parse_items'),)
def parse_items(self, response):
sel = Selector(response)
sites = sel.xpath('//div[#class="col-xs-12"]')
#items = []
for site in sites.xpath('.//article[#class="js_result_row"]'):
item = MonsterItem()
item['title'] = site.xpath('.//span[#itemprop = "title"]/text()').extract()
item['company'] = site.xpath('.//span[#itemprop = "name"]/text()').extract()
item['city'] = site.xpath('.//span[#itemprop = "addressLocality"]/text()').extract()
item['state'] = site.xpath('.//span[#itemprop = "addressRegion"]/text()').extract()
item['link'] = site.xpath('.//a[#data-m_impr_a_placement_id= "jsr"]/#href').extract()
follow = ''.join(item["link"])
request = Request(follow, callback = self.parse_dir_contents)
request.meta["item"] = item
yield request
#items.append(item)
#return items
def parse_dir_contents(self, response):
item = response.meta["item"]
item['desc'] = site.xpath('.//div[#itemprop = "description"]/text()').extract()
return item
Taking out the parse_dir_contents and uncommenting the empty "lists" list and "append" code was the original code.
Well, as #tayfun suggests you should use response.xpath or define the site variable.
By the way, you do not need to use sel = Selector(response). Responses come with the xpath function, there is no need to cover it into another selector.
However the main problem is that you restrict the domain of the spider. You define allowed_domains = ["jobs.monster.com"] however if you look at the URL to follow of your custom Request you can see that they are something like http://jobview.monster.com/ or http://job-openings.monster.com. In this case your parse_dir_contents is not executed (the domain is not allowed) and your item does not get returned so you won't get any results.
Change allowed_domains = ["jobs.monster.com"] to
allowed_domains = ["monster.com"]
and you will be fine and your app will work and return items.
You have an error in your parse_dir_contents method:
def parse_dir_contents(self, response):
item = response.meta["item"]
item['desc'] = response.xpath('.//div[#itemprop=description"]/text()').extract()
return item
Note the use of response. I don't know where you got site that you are currently using from.
Also, try to provide the error details when you post a question. Writing "it fails to work" doesn't say much.

how to scrape Tripadvisor dynamically using scrapy and python

I am trying to scrape TripAdvisor's reviews, but I cannot find the Xpath to have it dynamically go through all the pages. I tried yield and callback but the thing is I cannot find the xpath for the line that goes to the next page. I am talking about This site
Here Is my code(UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem
class scrapingtestspider(Spider):
name = "scrapytesting"
allowed_domains = ["tripadvisor.in"]
base_uri = "tripadvisor.in"
start_urls = [
"http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
output_json_dict = {}
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//a[contains(text(), "Next")]/#href').extract()
items = []
i=0
for sites in sites:
item = ScrapingTestingItem()
#item['reviews'] = sel.xpath('//p[#class="partial_entry"]/text()').extract()
item['subjects'] = sel.xpath('//span[#class="noQuotes"]/text()').extract()
item['stars'] = sel.xpath('//*[#class="rate sprite-rating_s rating_s"]/img/#alt').extract()
item['names'] = sel.xpath('//*[#class="username mo"]/span/text()').extract()
items.append(item)
i+=1
sites = sel.xpath('//a[contains(text(), "Next")]/#href').extract()
if(sites and len(sites) > 0):
yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
else:
yield items
If you want to select the URL behind Next why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/#href).extract()
And then yield a Request with this URL? With this you get always the next site to scrape and do not need the line containing the numbers.
Recently I did something similar on tripadvisor and this approach worked for me. If this won't work for you update your code with the approach you are trying to see where it can be approved.
Update
And change your Request creation block to the following:
if(sites and len(sites) > 0):
for site in sites:
yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield items at the end of the loop when the method finished with every parsing.
I think it can only work if you make a list of urls you want to scrap in a .txt file.
class scrapingtestspider(Spider):
name = "scrapytesting"
allowed_domains = ["tripadvisor.in"]
base_uri = "tripadvisor.in"
f = open("urls.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()

Categories

Resources