Problems while scraping a table from a website? - python

I am working on extracting the table from this site. Although I matched the XPaths and spotted the fields of the table, I'm not able to extract any content from the site. This is what my spider looks like:
# -*- coding: utf-8 -*-
import scrapy

from table.items import TableItem


class Table(scrapy.Spider):
    name = "table1"
    start_urls = (
        'wesite.com',
    )

    # //div[4]//div[1]//div[1]//table[1]
    def parse(self, response):
        sites = response.xpath('//*[@id="tabs-1"]/table//tr')[1:-2]
        print('\n***********************************\n', sites)

        for site in sites:
            item = TableItem()
            item['col1'] = site.xpath('td[1]/text()').extract()
            item['col2'] = site.xpath('td[2]/text()').extract()
            yield item
            print('\n**********\n', item)
I guess that my main problem is this line:
sites = response.xpath('//*[@id="tabs-1"]/table[1]/tr')
I actually can retrieve the content. However, it comes back in a badly malformed, heavily repeated format. Any idea how to get the table?

Sometimes browsers add their own DOM elements while rendering. For your given site, the right XPath selector is response.xpath('//*[@id="tabs-1"]/table//tr') to find the table rows.
Edited: Added code to fetch the right elements from the table
# -*- coding: utf-8 -*-
import scrapy

from table.items import TableItem


class Table(scrapy.Spider):
    name = "table1"
    start_urls = (
        'http://www.accessdata.fda.gov/scripts/drugshortages/default.cfm#tabs-1',
    )

    def parse(self, response):
        sites = response.xpath('//*[@id="tabs-1"]/table//tr')
        for site in sites:
            item = TableItem()
            item['col1'] = site.xpath('td/a/text()').extract_first()
            col2 = site.xpath('td/em/strong/text()')
            if col2:
                item['col2'] = col2[0].extract().strip()
            else:
                item['col2'] = 'Not Available'
            yield item

Related

How to scrape an infinite scrolling page?

I was trying to scrape the men's coats and jackets category on next.co.uk and realized that the page uses infinite scrolling.
# -*- coding: utf-8 -*-
import scrapy

from ..items import NextItem


class NewoneSpider(scrapy.Spider):
    name = 'newOne'
    allowed_domains = ['www.next.co.uk']
    start_urls = [
        'https://www.next.co.uk/shop/gender-newbornboys-gender-newbornunisex-gender-olderboys-gender-youngerboys-productaffiliation-coatsandjackets-0'
    ]

    def parse(self, response):
        items = NextItem()
        global productCategory
        global productSubCategory
        products = response.css('.Details')
        currentUrl = response.request.url
        for product in products:
            productCategory = 'Furniture'
            productSubCategory = 'living Room'
            productCountry = 'uk'
            productSeller = 'John Lewis'
            productLink = product.css('.TitleText::attr(href)').extract_first()
            productTitle = product.css('.Desc::text').extract_first()
            productImage = product.css('.Image img::attr(src)').extract_first()
            productSalePrice = product.css('.Price a::text').extract_first()

            items['productCategory'] = productCategory
            items['productSubCategory'] = productSubCategory
            items['productCountry'] = productCountry
            items['productSeller'] = productSeller
            items['productLink'] = productLink
            items['productTitle'] = productTitle
            items['productImage'] = productImage
            items['productSalePrice'] = productSalePrice
            yield items
I was able to scrape 28 items, but I can see more than that on the website, which uses infinite scrolling.
When you scroll down, the page sends an XHR call to the server and asks for more data.
Example:
https://www.next.co.uk/shop/gender-newbornboys-gender-newbornunisex-gender-olderboys-gender-youngerboys-productaffiliation-coatsandjackets/isort-score-minprice-0-maxprice-30000-srt-24
Each request is almost the same, but the last element in the URL grows by 24:
srt-24
srt-48
srt-72
Now that you know how the "infinite" scrolling works, you can try to simulate it with code.
Example:
import requests

URL_TEMPLATE = 'https://www.next.co.uk/shop/gender-newbornboys-gender-newbornunisex-gender-olderboys-gender-youngerboys-productaffiliation-coatsandjackets/isort-score-minprice-0-maxprice-30000-srt-{}'

for step in range(24, 240, 24):
    r = requests.get(URL_TEMPLATE.format(step))
    if r.status_code == 200:
        # TODO We have the data - let's parse it
        pass
If I have two links I want to scrape from, like
https://www.next.co.uk/shop/gender-newbornboys-gender-newbornunisex-gender-olderboys-gender-youngerboys-productaffiliation-coatsandjackets-0
and
https://www2.next.co.uk/shop/gender-men-productaffiliation-coatsandjackets-0
what would the code look like?
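One way to handle that is to loop over both listing URLs with the same offset logic. A minimal sketch, not from the original answer: the second template below is an assumption that the www2 link paginates with the same trailing srt-<offset> suffix, which you would need to confirm in the browser's network tab.

import requests

# Second template is an assumed rewrite of the www2 link into the same
# srt-<offset> pagination pattern; verify the real XHR URL before relying on it.
URL_TEMPLATES = [
    'https://www.next.co.uk/shop/gender-newbornboys-gender-newbornunisex-gender-olderboys-gender-youngerboys-productaffiliation-coatsandjackets/isort-score-minprice-0-maxprice-30000-srt-{}',
    'https://www2.next.co.uk/shop/gender-men-productaffiliation-coatsandjackets/isort-score-minprice-0-maxprice-30000-srt-{}',
]

for template in URL_TEMPLATES:
    for step in range(24, 240, 24):
        r = requests.get(template.format(step))
        if r.status_code != 200:
            break  # stop paging this category once the server stops responding
        # TODO: parse r.text here, just as in the single-URL example above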

Scrapy send condition to parse from start_requests(self)

I'm scraping a website which has different rows based on the type of item I'm scraping. I have a working scraper that looks like the 1st code block below; however, I would like to take a type from the database and send it from start_requests(self) to the parse function. I have 11 different types that all have a different number of rows for one table on part of the page, whereas the rest of the rows in the other tables on the page are the same. I have tried to show this in the 2nd code block.
How do I accomplish taking the type from the database in the start_requests, and sending it to parse?
1st code block
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get infoID and Type from database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID FROM dbo.infostage")
        rows = self.cursor.fetchall()
        for row in rows:
            url = 'http://www.nevermind.com/info/'
            yield self.make_requests_from_url(url + row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path
        itemPool = []

        InfoID = ''.join(response.url)
        id = InfoID[29:len(InfoID)-1]

        for info in infodata:
            item = infoItem()
            # Details
            item['id'] = id  # response.url
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
            item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            itemPool.append(item)
            yield item
        pass
2nd code block
This does not work, but I'm not sure how to get it working. Do I create a global list, or a new function?
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get infoID and Type from database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, type FROM dbo.infostage")
        rows = self.cursor.fetchall()
        for row in rows:
            url = 'http://www.nevermind.com/info/'
            type = row[1]  # how do I send this value to the parse function?
            yield self.make_requests_from_url(url + row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input base path
        itemPool = []

        InfoID = ''.join(response.url)
        id = InfoID[29:len(InfoID)-1]

        for info in infodata:
            item = infoItem()
            # Details
            item['id'] = id  # response.url

            # Here I need to implement a condition that comes from def start_requests(self).
            # If the condition is met, scrape the following fields, else the others.
            if type == 'type1':
                # This is where I would like to use it.
                # I have 11 different types that all have a different number of rows for one
                # table on part of the page, whereas the rest of the rows in the other tables
                # on the page are the same.
                # Type 1
                item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            else:
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()

            itemPool.append(item)
            yield item
        pass
Thank you all for your help and insight!
You can use request.meta
def make_requests_from_url(self, url, type, callback):
    request = scrapy.Request(url, callback)
    request.meta['type'] = type
    return request
In parse you can access type using response.meta['type']
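Put together with the question's spider, a minimal sketch (assuming import scrapy at the top and the same pyodbc setup as in the question) would look like this:

def start_requests(self):
    # ... same pyodbc connection/cursor setup as in the question ...
    self.cursor.execute("SELECT InfoID, type FROM dbo.infostage")
    for row in self.cursor.fetchall():
        url = 'http://www.nevermind.com/info/' + row[0]
        # carry the row's type along with the request
        yield scrapy.Request(url, callback=self.parse, meta={'type': row[1]})

def parse(self, response):
    row_type = response.meta['type']  # the value set in start_requests
    if row_type == 'type1':
        pass  # scrape the type-1 specific rows here
    else:
        pass  # scrape the shared rows here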

Scrapy correct xpath for unnamed div with image and text

I am building a Spider that traverses through several paginated pages and extracts data from the site:
http://www.usnews.com/education/best-global-universities/neuroscience-behavior
This is the spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html

from usnews.items import UsnewsItem


class UniversitiesSpider(scrapy.Spider):
    name = "universities"
    allowed_domains = ["usnews.com"]
    start_urls = (
        'http://www.usnews.com/education/best-global-universities/neuroscience-behavior/',
    )

    # Rules = [
    #     Rule(LinkExtractor(allow=(), restrict_xpaths=('.//a[@class="pager_link"]',)), callback="parse", follow=True)
    # ]

    def parse(self, response):
        for sel in response.xpath('.//div[@class="sep"]'):
            item = UsnewsItem()
            item['name'] = sel.xpath('.//h2[@class="h-taut"]/a/text()').extract()
            item['location'] = sel.xpath('.//span[@class="t-dim t-small"]/text()').extract()
            item['ranking'] = sel.xpath('.//div[3]/div[2]/text()').extract()
            item['score'] = sel.xpath('.//div[@class="t-large t-strong t-constricted"]/text()').extract()
            # print(sel.xpath('.//text()').extract())
            yield item
I am having problems extracting the text for the item "ranking". According to Google Chrome's XPath suggestion, the XPath is //*[@id="resultsMain"]/div[1]/div[1]/div[3]/div[2], which gives me the single number for the first entry and a bunch of empty values. It seems to sit inside a div next to an img tag, and I am confused about how to access it to just extract the text (for example #1, #22, etc.).
The following XPath should find the div containing an img child, and then return the non-empty text node child which contains the 'ranking':
for sel in response.xpath('.//div[@class="sep"]'):
    ...
    item['ranking'] = sel.xpath('div/div[img]/text()[normalize-space()]').extract()

scrapy SgmlLinkExtractor scrape Master and Detail pages

I am trying to extract information from Listing and Detail pages.
The code below correctly scrapes the reviewer information from the Listing page and all linked pages (where an a element contains "Next").
The detail_pages URLs are also captured, e.g. http://www.screwfix.com/p/prysmian-6242y-twin-earth-cable-2-5mm-x-100m-grey/20967
However, I cannot see how to navigate to and scrape the information from the Detail pages.
Is there anyone here who has used Scrapy successfully and can help me finish this spider?
Thank you for the help.
I include the code for the spider below:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

from hn_scraper.items import HnArticleItem


class ScrewfixSpider(Spider):
    name = "Screwfix"
    allowed_domains = ["www.screwfix.com"]
    start_urls = ('http://www.screwfix.com/', )

    link_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//a[contains(., "Next")]', ))

    detail_page_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//tr[@id[contains(., "reviewer")]]/td[3]/a', ))

    def extract_one(self, selector, xpath, default=None):
        extracted = selector.xpath(xpath).extract()
        if extracted:
            return extracted[0]
        return default

    def parse(self, response):
        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(link_text=link.text)
            yield request

        for item in self.parse_item(response):
            yield item

    def parse_item(self, response):
        selector = Selector(response)

        rows = selector.xpath('//table[contains(.,"crDataGrid")]//tr[@id[contains(., "reviewer")]]')
        for row in rows:
            item = HnArticleItem()

            reviewer = row.xpath('td[3]/a')
            reviewer_url = self.extract_one(reviewer, './@href', '')
            reviewer_name = self.extract_one(reviewer, 'b/text()', '')
            total_reviews = row.xpath('td[4]/text()').extract()

            item['url'] = reviewer_url
            item['name'] = reviewer_name
            item['total_reviews'] = total_reviews
            yield item

        detail_pages = self.detail_page_extractor.extract_links(response)
        if detail_pages:
            print 'detail_pages'
            print detail_pages[0].url
            yield Request(detail_pages[0].url)
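One way to reach the Detail pages from here (a minimal sketch, not part of the original spider; parse_detail and its //h1 selector are placeholders to adjust to the real markup) is to yield a Request for every extracted detail link with a dedicated callback:

    def parse_item(self, response):
        # ... keep the reviewer extraction above, then follow every detail link
        for link in self.detail_page_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse_detail)

    def parse_detail(self, response):
        selector = Selector(response)
        item = HnArticleItem()
        # Placeholder field: change the XPath to whatever the detail page really exposes.
        item['name'] = self.extract_one(selector, '//h1/text()', '')
        item['url'] = response.url
        yield item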

Advice extracting //td text and numbers

I have been working through the tutorial, adapting it to a project I want to achieve. I seem to have something going wrong and I just can't find the error.
When using 'scrapy shell' I can get the response I expect. So, for this site (NRL Ladder):
In [1]: hxs.select('//td').extract()
Out[1]:
[u'<td>\r\n<div id="ls-nav">\r\n<ul><li><span>Home</span></li>\r\n<li class="ls-nav-on"><span>NRL</span></li>\r\n<li><span>NYC</span></li>\r\n<li><span>Rep Matches</span></li>\r\n\r\n</ul></div>\r\n</td>',
u'<td style="text-align:left" colspan="5">Round 4</td>',
u'<td colspan="5">Updated: 26/3/2012</td>',
u'<td style="text-align:left">1. Melbourne</td>',
u'<td>4</td>',
u'<td>4</td>',
u'<td>0</td>',
u'<td>0</td>',
u'<td>0</td>',
u'<td>122</td>',
u'<td>39</td>',
u'<td>83</td>',
u'<td>8</td>',
u'<td style="text-align:left">2. Canterbury-Bankstown</td>',
And on it goes.
I am really struggling to understand how to alter the tutorial project for a different kind of data.
Is there any way to bring up help or a documentation list to see what types I should use in items when using 'td' or any other element? Like I say, it works easily in the shell, but I cannot translate it into the files. Specifically, both the team names and the points are in 'td' elements, but the team name is text.
Here is what I have done:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from nrl.items import NrlItem


class nrl(BaseSpider):
    name = "nrl"
    allowed_domains = ["http://live.nrlstats.com/"]
    start_urls = [
        "http://live.nrlstats.com/nrl/ladder.html",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//td')
        items = []
        for site in sites:
            item = nrlItem()
            item['team'] = site.select('/text()').extract()
            item['points'] = site.select('/').extract()
            items.append(item)
        return items
I didn't quite understand your question, but here is a starting point, imo (haven't tested; see some comments in the code):
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from nrl.items import NrlItem


class nrl(BaseSpider):
    name = "nrl"
    allowed_domains = ["live.nrlstats.com"]  # domains should be like this
    start_urls = [
        "http://live.nrlstats.com/nrl/ladder.html",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="tabler"]//tr[starts-with(@class, "r")]')  # select team rows
        items = []
        for row in rows:
            item = NrlItem()
            columns = row.select('./td/text()').extract()  # select columns for the selected row
            item['team'] = columns[0]
            item['P'] = int(columns[1])
            item['W'] = int(columns[2])
            ...
            items.append(item)
        return items
UPDATE:
//table[@class="tabler"]//tr[starts-with(@class, "r")] is an XPath query. See some XPath examples here.
hxs.select(xpath_query) always returns a list of nodes (also of type HtmlXPathSelector) which fall under the given query.
hxs.extract() returns a string representation of the node(s).
P.S. Beware that Scrapy supports XPath 1.0 but not 2.0 (at least on Linux, not sure about Windows), so some of the newest XPath features might not work.
See also:
http://doc.scrapy.org/en/latest/topics/selectors.html
http://doc.scrapy.org/en/latest/topics/firefox.html
