Scrapy table columns and rows don't work - Python

I want to scrape the table on this page, but the scraped data ends up in only one column, and in some cases the data doesn't appear at all. I also used the shell to check whether the XPath expressions are correct (I used the XPath Helper extension to identify them).
import scrapy


class ToScrapeSpiderXPath(scrapy.Spider):
    name = 'scrape-xpath'
    start_urls = [
        'http://explorer.eu/contents/food/28?utf8=/',
    ]

    def parse(self, response):
        for flv in response.xpath('//html/body/main/div[4]'):
            yield {
                'Titulo': flv.xpath('//*[@id="chromatography"]/table/tbody/tr[3]/th/strong/a/text()').extract(),
                'contenido': flv.xpath('//*[@id="chromatography"]/table/tbody/tr[5]/td[3]/a[2]/text()').extract(),
                'clase': flv.xpath('//*[@id="chromatography"]/table/tbody/tr[5]/td[1]/text()').extract(),
                'Subclase': flv.xpath('//*[@id="chromatography"]/table/tbody/tr[5]/td[2]/a/text()').extract(),
            }

From the example URL given, it's not entirely obvious what the values should be or how extraction should generalize to pages containing more records, so I tried a different page with multiple records; see whether the result gets you what you need. Here's ready-to-run code:
# -*- coding: utf-8 -*-
import scrapy


class PhenolExplorerSpider(scrapy.Spider):
    name = 'phenol-explorer'
    start_urls = ['http://phenol-explorer.eu/contents/food/29?utf8=/']

    def parse(self, response):
        chromatography = response.xpath('//div[@id="chromatography"]')
        title = chromatography.xpath('.//tr/th[@class="outer"]/strong/a/text()').extract_first()
        for row in chromatography.xpath('.//tr[not(@class="header")]'):
            class_ = row.xpath('./td[@rowspan]/text()').extract_first()
            if not class_:
                class_ = row.xpath('./preceding-sibling::tr[td[@rowspan]][1]/td[@rowspan]/text()').extract_first()
            subclass = row.xpath('./td[not(@rowspan)][1]/a/text()').extract_first()
            #content = row.xpath('./td[not(@rowspan)][2]/a[2]/text()').extract_first()
            content = row.xpath('./td[not(@rowspan)][2]/text()').extract_first()
            yield {
                'title': title.strip(),
                'class': class_.strip(),
                'subclass': subclass.strip(),
                'content': content.strip(),
            }
Basically, it iterates over individual rows of the table and extracts the data from corresponding fields, yielding an item once complete information is collected.
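If it helps, this can be run without a full Scrapy project via the runspider command, e.g. scrapy runspider phenol_explorer.py -o items.json (the file name is just an assumption here; the -o flag writes the yielded items to a JSON feed).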

Try this:
for row in response.css('#chromatography table tr:not(.header)'):
    yield {
        'titulo': row.xpath('./preceding-sibling::tr/th[contains(@class, "outer")]//a/text()').extract_first().strip(),
        'clase': row.xpath('./preceding-sibling::tr/th[contains(@class, "inner")]//text()').extract_first().strip(),
        'subclase': row.xpath('./td[2]//text()').extract_first().strip(),
        'contenido': row.css('.content_value a::text').extract_first().strip(),
    }
Remember that the inner loop selectors should also be relative to the node (flv in your case); selecting with // is a global selector, so it will grab everything from the whole page.
It's also better to inspect the actual HTML source, because the browser may render markup that differs from the HTML actually received (for example, the tbody tags).
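To illustrate the relative-versus-global point, here is a minimal sketch (not tied to this particular page): inside a loop, a path starting with ./ is evaluated against the current row, while a path starting with // is evaluated against the whole document.
for row in response.xpath('//table//tr'):
    # relative: only the cells of this row
    row_cells = row.xpath('./td/text()').extract()
    # global: '//' ignores row and matches every td in the whole document
    all_cells = row.xpath('//td/text()').extract()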

Related

Scrapy one item with multiple parsing functions

I am using Scrapy with Python to scrape a website, and I am facing some difficulties filling in the item that I have created.
The products are properly scraped, and everything works well as long as the info is located within the response.xpath mentioned in the for loop.
'trend' and 'number' are properly added to the item using ItemLoader.
However, the date of the product is not located within the response.xpath cited below, but in the page title, which I can get with response.css('title').
import scrapy
import datetime
from trends.items import Trend_item
from scrapy.loader import ItemLoader


# Initiate the spider
class trendspiders(scrapy.Spider):
    name = 'milk'
    start_urls = ['https://thewebsiteforthebestmilk/ireland/2022-03-16/7/']

    def parse(self, response):
        for milk_unique in response.xpath('/html/body/main/div/div[2]/div[1]/section[1]/div/div[3]/table/tbody/tr'):
            l = ItemLoader(item=Milk_item(), selector=milk_unique, response=response)
            l.add_css('milk', 'a::text')
            l.add_css('number', 'span.small.text-muted::text')
            return l.load_item()
How can I add the 'date' (found in response.css('title')) to my item, please?
I have tried adding l.add_css('date', "response.css('title')") in the for loop, but it returns an error.
Should I create a new parsing function? If yes then how to send the info to the same Item?
I hope I’ve made myself clear.
Thank you very much for your help,
Since the date is outside of the selector you are using for each row, you should extract it first, before your for loop, since it doesn't need to be updated on each iteration.
Then, with your item loader, you can just use l.add_value to load it along with the rest of the fields.
For example:
class trendspiders(scrapy.Spider):
    name = 'trends'
    start_urls = ['https://getdaytrends.com/ireland/2022-03-16/7/']

    def parse(self, response):
        date_str = response.xpath("//title/text()").get()
        for trend_unique in response.xpath('/html/body/main/div/div[2]/div[1]/section[1]/div/div[3]/table/tbody/tr'):
            l = ItemLoader(item=Trend_item(), selector=trend_unique, response=response)
            l.add_css('trend', 'a::text')
            l.add_css('number', 'span.small.text-muted::text')
            l.add_value('date', date_str)
            yield l.load_item()
If response.css('title').get() gives you the answer you need, why not use the same CSS with add_css:
l.add_css('date', 'title')
Also, .add_css('date', "response.css('title')") is invalid because the second argument must be a valid CSS selector, not a Python expression.
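If you then need the date as an actual date object rather than the raw title string, one option is to attach an input processor to the item field. This is only a sketch under the assumption that the title contains an ISO-formatted date somewhere in it (the parse_title_date helper is hypothetical; adapt it to the real title text):
import datetime

import scrapy
from itemloaders.processors import MapCompose, TakeFirst


def parse_title_date(text):
    # hypothetical helper: look for a YYYY-MM-DD token inside the title string
    for token in text.split():
        try:
            return datetime.datetime.strptime(token, '%Y-%m-%d').date()
        except ValueError:
            continue
    return None


class Trend_item(scrapy.Item):
    trend = scrapy.Field()
    number = scrapy.Field()
    date = scrapy.Field(input_processor=MapCompose(parse_title_date),
                        output_processor=TakeFirst())
With that in place, l.add_value('date', date_str) (or l.add_css('date', 'title::text')) would store a cleaned date instead of the whole title.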

Scrapy not following the next parse function

I am trying to write a simple scraping script to scrape Google Summer of Code orgs that use the tech that I require. It's a work in progress. My parse function works fine, but whenever I callback into the parse_org function it doesn't produce any output.
# -*- coding: utf-8 -*-
import scrapy


class GsocSpider(scrapy.Spider):
    name = 'gsoc'
    allowed_domains = ['https://summerofcode.withgoogle.com/archive/2018/organizations/']
    start_urls = ['https://summerofcode.withgoogle.com/archive/2018/organizations/']

    def parse(self, response):
        for href in response.css('li.organization-card__container a.organization-card__link::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_org)

    def parse_org(self, response):
        tech = response.css('li.organization__tag organization__tag--technology::text').extract()
        #if 'python' in tech:
        yield
        {
            'name': response.css('title::text').extract_first()
            #'ideas_list': response.css('')
        }
First of all, you are configuring allowed_domains incorrectly. As the documentation specifies:
An optional list of strings containing domains that this spider is
allowed to crawl. Requests for URLs not belonging to the domain names
specified in this list (or their subdomains) won’t be followed if
OffsiteMiddleware is enabled.
Let’s say your target url is https://www.example.com/1.html, then add
'example.com' to the list.
As you can see, you need to include only the domains; this is a filtering feature (so that other domains don't get crawled). It is also optional, so I would actually recommend not including it.
Also, your CSS for getting tech is incorrect; it should be:
li.organization__tag.organization__tag--technology
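Putting both fixes together, a minimal sketch of how the spider could look (only a sketch; note that it also moves the dict onto the same line as yield, because a bare yield followed by a separate dict literal yields None, which is another reason you see no output):
import scrapy


class GsocSpider(scrapy.Spider):
    name = 'gsoc'
    # domain only, no scheme or path (or simply omit allowed_domains)
    allowed_domains = ['summerofcode.withgoogle.com']
    start_urls = ['https://summerofcode.withgoogle.com/archive/2018/organizations/']

    def parse(self, response):
        for href in response.css('li.organization-card__container a.organization-card__link::attr(href)'):
            yield response.follow(href, callback=self.parse_org)

    def parse_org(self, response):
        # both classes are on the same element, so chain them with a dot, no space
        tech = response.css('li.organization__tag.organization__tag--technology::text').extract()
        yield {
            'name': response.css('title::text').extract_first(),
            'tech': tech,
        }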

wrong Xpath in IMDB spider scrapy

Here:
IMDB scrapy get all movie data
response.xpath("//*[#class='results']/tr/td[3]")
returns empty list. I tried to change it to:
response.xpath("//*[contains(#class,'chart full-width')]/tbody/tr")
without success.
Any help please? Thanks.
I did not have time to go through IMDB scrapy get all movie data thoroughly, but I got the gist of it. The problem statement is to get all movie data from the given site. It involves two things: first, go through all the pages that contain the list of all the movies of that year; second, get the link to each movie, and from there you do your own magic.
The problem you faced is with getting the XPath for the link to each movie. This is most likely due to a change in the website structure (I did not have time to verify what the difference might be). Anyway, the following are the XPaths you would require.
FIRST :
We take the div with class nav as a landmark and find the element with class lister-page-next next-page among its children.
response.xpath("//div[@class='nav']/div/a[@class='lister-page-next next-page']/@href").extract_first()
This gives the link to the next page, or returns None if you are at the last page (since the next-page element is not present there).
SECOND :
This addresses the OP's original question.
# Get the list of the containers having the title, etc.
list = response.xpath("//div[@class='lister-item-content']")
# From each container, extract the required links
paths = list.xpath("h3[@class='lister-item-header']/a/@href").extract()
Now all you need to do is loop through each of these path elements and request the page.
Thanks for your answer. I eventually used your XPath like so:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from crawler.items import MovieItem

IMDB_URL = "http://imdb.com"


class IMDBSpider(CrawlSpider):
    name = 'imdb'
    # in order to move to the next page
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=("//div[@class='nav']/div/a[@class='lister-page-next next-page']",)),
                  callback="parse_page", follow=True),)

    def __init__(self, start=None, end=None, *args, **kwargs):
        super(IMDBSpider, self).__init__(*args, **kwargs)
        self.start_year = int(start) if start else 1874
        self.end_year = int(end) if end else 2017

    # generate start_urls dynamically
    def start_requests(self):
        for year in range(self.start_year, self.end_year + 1):
            # movies are sorted by number of votes
            yield scrapy.Request('http://www.imdb.com/search/title?year={year},{year}&title_type=feature&sort=num_votes,desc'.format(year=year))

    def parse_page(self, response):
        content = response.xpath("//div[@class='lister-item-content']")
        # list of paths of movies in the current page
        paths = content.xpath("h3[@class='lister-item-header']/a/@href").extract()
        # all movies in this page
        for path in paths:
            item = MovieItem()
            item['MainPageUrl'] = IMDB_URL + path
            request = scrapy.Request(item['MainPageUrl'], callback=self.parse_movie_details)
            request.meta['item'] = item
            yield request

    # make sure that the start_urls are parsed as well
    parse_start_url = parse_page

    def parse_movie_details(self, response):
        pass  # lots of parsing....
Run it with scrapy crawl imdb -a start=<start-year> -a end=<end-year>

Scrapy Extract ld+JSON

How to extract the name and url?
quotes_spiders.py
import scrapy
import json


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["http://www.lazada.com.my/shop-power-banks2/?price=1572-1572"]

    def parse(self, response):
        data = json.loads(response.xpath('//script[@type="application/ld+json"]//text()').extract_first())
        # how to extract the name and url?
        yield data
Data to Extract
<script type="application/ld+json">{"@context":"https://schema.org","@type":"ItemList","itemListElement":[{"@type":"Product","image":"http://my-live-02.slatic.net/p/2/test-product-0601-7378-08684315-8be741b9107b9ace2f2fe68d9c9fd61a-webp-catalog_233.jpg","name":"test product 0601","offers":{"@type":"Offer","availability":"https://schema.org/InStock","price":"99999.00","priceCurrency":"RM"},"url":"http://www.lazada.com.my/test-product-0601-51348680.html?ff=1"}]}</script>
This line of code returns a dictionary with the data you want:
data = json.loads(response.xpath('//script[@type="application/ld+json"]//text()').extract_first())
All you need to do is to access it like:
name = data['itemListElement'][0]['name']
url = data['itemListElement'][0]['url']
Given that the microdata contains a list, you will need to check that you are referring to the correct product in the list.
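For instance, to yield one item per product instead of hard-coding index 0, you could loop over the list (a small sketch of the same idea):
for product in data['itemListElement']:
    yield {
        'name': product['name'],
        'url': product['url'],
    }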
A really easy solution for this would be to use https://github.com/scrapinghub/extruct. It handles all the hard parts of extracting structured data.
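As a rough sketch of that approach (the exact keyword arguments may vary between extruct versions, so treat this as an assumption to verify against the extruct README):
import extruct
import scrapy


class ProductsSpider(scrapy.Spider):
    name = 'products'
    start_urls = ['http://www.lazada.com.my/shop-power-banks2/?price=1572-1572']

    def parse(self, response):
        # pull all structured data (JSON-LD, microdata, ...) out of the page
        data = extruct.extract(response.text, base_url=response.url)
        for entry in data.get('json-ld', []):
            for product in entry.get('itemListElement', []):
                yield {'name': product.get('name'), 'url': product.get('url')}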

Using Scrapy for XML page

I'm trying to scrape multiple pages from an API to practice and develop my XML scraping. One issue that has arisen is that when I try to scrape a document formatted like this: http://i.imgur.com/zJqeYvG.png and store it as XML, it fails to do so.
Within the CMD it fetches the URL and creates the XML file on my computer, but there's nothing in it.
How would I fix it to echo out the whole document, or even parts of it?
My code is below:
from scrapy.spider import BaseSpider
from scrapy.selector import XmlXPathSelector
from doitapi.items import DoIt
import random


class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["do-it.org.uk"]
    start_urls = []
    number = []
    for count in range(100):
        number.append(random.randint(2000000, 2500000))
    for i in number:
        start_urls.append("http://www.do-it.org.uk/syndication/opportunities/%d?apiKey=XXXXX-XXXX-XXX-XXX-XXXXX" % i)

    def parse(self, response):
        xxs = XmlXPathSelector(response)
        titles = xxs.register_namespace("d", "http://www.do-it.org.uk/volunteering-opportunity")
        items = []
        for titles in titles:
            item = DoIt()
            item["url"] = response.url
            item["name"] = titles.select("//d:title").extract()
            item["description"] = titles.select("//d:description").extract()
            item["username"] = titles.select("//d:info-provider/name").extract()
            item["location"] = titles.select("//d:info-provider/address").extract()
            items.append(item)
        return items
Your XML file is using the namespace "http://www.do-it.org.uk/volunteering-opportunity", so to select title, name, etc. you have two choices:
either use xxs.remove_namespaces() once and then use .select("./title"), .select("./description") etc.
or register the namespace once, with a prefix like "doit", xxs.register_namespace("doit", "http://www.do-it.org.uk/volunteering-opportunity"), and then use .select("./doit:title"), .select("./doit:description") etc.
For more details on XML namespaces, see this page in the FAQ and this page in the docs
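A minimal sketch of the first option, just to show the shape of a working parse (the //opportunity record element is a guess made here for illustration; replace it with the actual repeated element in the feed):
def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()  # strip the default namespace once
    items = []
    # iterate over the repeated record nodes, then select fields relative to each node
    for node in xxs.select("//opportunity"):  # hypothetical element name
        item = DoIt()
        item["url"] = response.url
        item["name"] = node.select("./title/text()").extract()
        item["description"] = node.select("./description/text()").extract()
        items.append(item)
    return items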
