I'm scraping a website that has different rows based on the type of item being scraped. I have a working scraper that looks like the first code block below; however, I would like to take a type from the database and send it from start_requests(self) to the parse function. I have 11 different types, which all have a different number of rows in one table on part of the page, while the rows in the other tables on the page are the same. I have tried to show what I mean in the second code block.
How do I take the type from the database in start_requests and send it to parse?
First code block:
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get InfoID and Type from the database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID FROM dbo.infostage")
        rows = self.cursor.fetchall()

        for row in rows:
            url = 'http://www.nevermind.com/info/'
            yield self.make_requests_from_url(url + row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path
        itemPool = []

        InfoID = ''.join(response.url)
        id = InfoID[29:len(InfoID)-1]

        for info in infodata:
            item = infoItem()
            # Details
            item['id'] = id  # response.url
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
            item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()

            itemPool.append(item)
            yield item
        pass
Second code block:
This does not work, but I'm not sure how to get it working. Do I create a global list, or a new function?
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get InfoID and Type from the database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, type FROM dbo.infostage")
        rows = self.cursor.fetchall()

        for row in rows:
            url = 'http://www.nevermind.com/info/'
            type = row[1]  # how do I send this value to the parse function?
            yield self.make_requests_from_url(url + row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input base path
        itemPool = []

        InfoID = ''.join(response.url)
        id = InfoID[29:len(InfoID)-1]

        for info in infodata:
            item = infoItem()
            # Details
            item['id'] = id  # response.url

            # Here I need a condition that comes from start_requests().
            # If the condition is met, scrape the following fields; otherwise scrape the others.
            if type == 'type1':
                # This is where I would like to use it.
                # I have 11 different types, which all have a different number of rows in one
                # table on part of the page, while the rows in the other tables are the same.
                # Type 1
                item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            else:
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()

            itemPool.append(item)
            yield item
        pass
Thank you all for your help and insight!
You can use request.meta
def make_requests_from_url(self, url, type, callback):
    request = scrapy.Request(url, callback)
    request.meta['type'] = type
    return request
In parse you can access type using response.meta['type']
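For context, here is a minimal sketch of how that fits the spider above, assuming the same connection string, table, and URL prefix as in the question (class and variable names are illustrative); it builds the scrapy.Request directly instead of overriding make_requests_from_url, so the type travels in the request's meta dict:

import scrapy
import pyodbc


class InfoSpider(scrapy.Spider):
    name = "info"

    def start_requests(self):
        conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        cursor = conn.cursor()
        cursor.execute("SELECT InfoID, type FROM dbo.infostage")
        for info_id, row_type in cursor.fetchall():
            url = 'http://www.nevermind.com/info/' + info_id
            # attach the database value to the request so parse() can read it
            yield scrapy.Request(url, callback=self.parse, meta={'type': row_type})

    def parse(self, response):
        row_type = response.meta['type']  # the value attached in start_requests
        if row_type == 'type1':
            pass  # scrape the type1-specific rows here
        else:
            pass  # scrape the shared rows here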
Related
I am trying to figure out whether my Scrapy spider is correctly hitting product_link in the request callback - yield scrapy.Request(product_link, callback=self.parse_new_item).
product_link should be 'https://www.antaira.com/products/10-100Mbps/LNX-500A',
but I have not been able to confirm whether my program is jumping into the next step, so that I can retrieve the correct yield return. Thank you!
# Import the required libraries
import scrapy

# Import the Item class with fields
# mentioned in the items.py file
from ..items import AntairaItem


# Spider class name
class productJumper(scrapy.Spider):

    # Name of the spider
    name = 'productJumper'

    # The domain to be scraped
    allowed_domains = ['antaira.com']

    # The URLs to be scraped from the domain
    start_urls = ['https://www.antaira.com/products/10-100Mbps']
    #target_url = ['https://www.antaira.com/products/10-100Mbps/LNX-500A']

    # First Step: Find every div with the class 'product-container' and step into the links
    def parse(self, response):
        #product_link = response.urljoin(rel_product_link)
        # creating items dictionary
        items = AntairaItem()
        rel_product_link = response.css('div.center767')
        for url in rel_product_link:
            rel_product_link = response.xpath('//div[@class="product-container"]//a/@href').get(),
            product_link = response.urljoin('rel_product_link'),
            items['rel_product_link'] = rel_product_link,
            items['product_link'] = product_link

            #yield items
            # 2nd Step: Return a list of all the product links that will be scraped
            #yield {
            #    # take the first relative product link
            #    'rel_product_link' : rel_product_link,
            #    'product_link' : product_link,
            #}
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    # Final Step: Run through each product and yield the results
    def parse_new_item(self, response):
        for product in response.css('main.products'):
            name = product.css(('h1.product-name::text').strip(' \t\n\r')).get()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = product.css('div.selectors img::attr(src)').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
You have a couple of issues:
scrapy items are essentially dictionaries and are therefore mutable. You need to create a unique item for each and every yield statement.
your second parse callback is referencing a variable items that it doesn't have access to because it was defined in your first parse callback.
In your urljoin method you are using a string literal instead of a variable for rel_product_link
In the example below I fixed those issues and made some additional notes
import scrapy
from ..items import AntairaItem


class ProductJumper(scrapy.Spider):  # classes should be TitleCase

    name = 'productJumper'
    allowed_domains = ['antaira.com']
    start_urls = ['https://www.antaira.com/products/10-100Mbps']

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = product.css('div.selectors img::attr(src)').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name
            items['features'] = features
            items['overview'] = overview
            items['main_image'] = main_image
            items['rel_links'] = rel_links
            yield items
I am working on extracting the table from this site. Although I matched the XPaths and spotted the fields of the table, I'm not able to extract any content from the site. This is what my spider looks like:
# -*- coding: utf-8 -*-
import scrapy
from table.items import TableItem


class Table(scrapy.Spider):
    name = "table1"

    start_urls = (
        'wesite.com',
    )

    #//div[4]//div[1]//div[1]//table[1]
    #
    def parse(self, response):
        sites = response.xpath('//*[@id="tabs-1"]/table//tr')[1:-2]
        print('\n***********************************\n', sites)
        for site in sites:
            item = TableItem()
            item['col1'] = site.xpath('td[1]/text()').extract()
            item['col2'] = site.xpath('td[2]/text()').extract()
            yield item
            print('\n**********\n', item)
I guess that my main problem is this line:
sites = response.xpath('//*[@id="tabs-1"]/table[1]/tr')
I actually can retrieve the content; however, it comes back in a very large, repeated, malformed format. Any idea how to get the table?
Sometimes browsers add their own DOM elements while rendering (for example, the tbody your browser shows may not exist in the raw HTML that Scrapy downloads). For your given site, the right XPath selector to find the table rows is response.xpath('//*[@id="tabs-1"]/table//tr').
Edited: Added code to fetch the right elements from the table
# -*- coding: utf-8 -*-
import scrapy
from table.items import TableItem


class Table(scrapy.Spider):
    name = "table1"

    start_urls = (
        'http://www.accessdata.fda.gov/scripts/drugshortages/default.cfm#tabs-1',
    )

    def parse(self, response):
        sites = response.xpath('//*[@id="tabs-1"]/table//tr')
        for site in sites:
            item = TableItem()
            item['col1'] = site.xpath('td/a/text()').extract_first()
            col2 = site.xpath('td/em/strong/text()')
            if col2:
                item['col2'] = site.xpath('td/em/strong/text()')[0].extract().strip()
            else:
                item['col2'] = 'Not Available'
            yield item
I'm trying to scrape details from a subsite and merge them with the details scraped from the main site. I've been researching through Stack Overflow as well as the documentation; however, I still can't get my code to work. It seems that my function to extract additional details from the subsite does not work. If anyone could take a look, I would be very grateful.
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapeInfo.items import infoItem, InfoItemSubSite
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get InfoID and category from the database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, category FROM dbo.StageItem")
        rows = self.cursor.fetchall()

        for row in rows:
            url = 'http://www.nevermind.com/info/'
            InfoID = row[0]
            category = row[1]
            yield self.make_requests_from_url(url + InfoID, InfoID, category, self.parse)

    def make_requests_from_url(self, url, InfoID, category, callback):
        request = Request(url, callback)
        request.meta['InfoID'] = InfoID
        request.meta['category'] = category
        return request

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path
        itemPool = []

        InfoID = response.meta['InfoID']
        category = response.meta['category']

        for info in infodata:
            item = infoItem()
            item_cur, item_hist = InfoItemSubSite()

            # Stem Details
            item['id'] = InfoID
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()

            item_cur['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item_cur['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()

            # Extract additional information about item_cur from the referring site
            # This part does not work
            if item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract():
                url = 'http://www.nevermind.com/info/sub/' + item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()[0]
                request = Request(url, housingtype, self.parse_item_sub)
                request.meta['category'] = category
                yield self.parse_item_sub(url, category)

            item_his['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_his['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            item_his['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract()

            item['subsite_dic'] = [dict(item_cur), dict(item_his)]

            itemPool.append(item)
            yield item
        pass

    # Function to extract additional info from the subsite and return it to the original item.
    def parse_item_sub(self, response, category):
        hxs = Selector(response)
        subsite = hxs.xpath('div/div[2]')  # input base path
        category = response.meta['category']

        for i in subsite:
            item = InfoItemSubSite()
            if (category == 'first'):
                item['subsite_field1'] = i.xpath('/td[2]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[2]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[5]/a[1]/@href').extract()
            else:
                item['subsite_field1'] = i.xpath('/tr[10]/td[3]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[1]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[7]/a[1]/@href').extract()
            return item
        pass
I've been looking at these examples together with a lot of other examples (Stack Overflow is great for that!), as well as the Scrapy documentation, but I am still unable to understand how to get details sent from one function and merged with the scraped items from the original function.
how do i merge results from target page to current page in scrapy?
How can i use multiple requests and pass items in between them in scrapy python
What you are looking for here is called request chaining. Your problem is yielding one item built from several requests. The solution is to chain requests while carrying your item in the request's meta attribute.
Example:
def parse(self, response):
    item = MyItem()
    item['name'] = response.xpath("//div[@id='name']/text()").extract()
    more_page = # some page that offers more details
    # go to more page and take your item with you.
    yield Request(more_page,
                  self.parse_more,
                  meta={'item': item})

def parse_more(self, response):
    # get your item from the meta
    item = response.meta['item']
    # fill it in with more data and yield!
    item['last_name'] = response.xpath("//div[@id='lastname']/text()").extract()
    yield item
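As a side note, newer Scrapy versions (1.7+) also offer cb_kwargs, which passes data to the next callback as plain function arguments instead of through meta. A minimal sketch of the same chaining with cb_kwargs, reusing the hypothetical MyItem and selectors from above (the detail-page URL is a placeholder):

def parse(self, response):
    item = MyItem()
    item['name'] = response.xpath("//div[@id='name']/text()").extract()
    more_page = "http://example.com/details"  # hypothetical detail page
    # hand the item to the next callback as a keyword argument
    yield Request(more_page, self.parse_more, cb_kwargs={'item': item})

def parse_more(self, response, item):
    # the item arrives as a regular argument instead of via response.meta
    item['last_name'] = response.xpath("//div[@id='lastname']/text()").extract()
    yield item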
import scrapy
from mystocks.items import MystocksItem
from scrapy.selector import Selector
import datetime


class Synergyspider(scrapy.Spider):
    name = "synergyspider"
    allowed_domains = ["http://live.mystocks.co.ke/price_list/"]

    def parse(self, response):
        sel = Selector(response)
        head = sel.xpath('//*[@id="main"]/h2')
        rows_r1 = sel.xpath('//tr[@class = "row r1"]')
        items = []

        for row in rows_r1:
            item = MystocksItem()
            item['date'] = head.xpath('text()').extract()[0]
            item['code'] = rows_r1.xpath('./td[1]/a/text()').extract()[0]
            item['name'] = rows_r1.xpath('./td[2]/text()').extract()[0]
            item['last12_low'] = rows_r1.xpath('./td[3]/text()').extract()[0]
            item['last12_high'] = rows_r1.xpath('./td[4]/text()').extract()[0]
            #item['day_low'] = rows_r1.xpath('./td[5]/text()').extractf()[0]
            item['day_high'] = rows_r1.xpath('./td[6]/text()').extract()[0]
            item['price'] = rows_r1.xpath('./td[7]/text()').extract()[0]
            item['previous'] = rows_r1.xpath('./td[8]/text()').extract()[0]
            item['change'] = rows_r1.xpath('./td[9]/text()').extract()[0]
            item['percentChange'] = rows_r1.xpath('./td[10]/text()').extract()[0]
            item['volume'] = rows_r1.xpath('./td[12]/text()').extract()[0]
            item['adjustedPrice'] = rows_r1.xpath('./td[13]/text()').extract()[0]
            items.append(item)

        return items
The url I was trying to parse is http://live.mystocks.co.ke/price_list/20140402
I want to iterate over the rows with classes "row r1" and "row r0" but skip the rows with class "row".
Found out I was looping on the wrong variable; loop on row, not rows_r1. A sketch of the corrected loop is below.
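A minimal sketch of the corrected loop, assuming the row classes are exactly "row r1" and "row r0" and the same column layout as the code above (only a few of the columns are shown; the field names mirror MystocksItem from the question):

import scrapy
from mystocks.items import MystocksItem


class Synergyspider(scrapy.Spider):
    name = "synergyspider"
    start_urls = ["http://live.mystocks.co.ke/price_list/20140402"]

    def parse(self, response):
        date = response.xpath('//*[@id="main"]/h2/text()').extract_first()
        # match rows whose class is exactly "row r1" or "row r0", skipping plain "row"
        for row in response.xpath('//tr[@class="row r1" or @class="row r0"]'):
            item = MystocksItem()
            item['date'] = date
            item['code'] = row.xpath('./td[1]/a/text()').extract_first()
            item['name'] = row.xpath('./td[2]/text()').extract_first()
            item['price'] = row.xpath('./td[7]/text()').extract_first()
            yield item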
I am on this page: http://www.metacritic.com/browse/games/title/ps4/a?view=condensed
And I want to go into each item and get the Developer and Genre, but my code doesn't seem to work.
For example, I want to go into this page: http://www.metacritic.com/game/playstation-4/angry-birds-star-wars
Then leave it and continue through the rest, doing the same and adding to the database. What can I change in my code to make it work? Right now the dev and genre columns in the database are null, but it gets the rest of the data, so it's as if it never enters parseGame.
Also, I added print statements to parseGame and none of them print.
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from metacritic.items import MetacriticItem
import MySQLdb
import re
from string import lowercase


class MetacriticSpider(BaseSpider):

    def start_requests(self):
        #iterate through ps4 pages
        for c in lowercase:
            for i in range(self.max_id):
                yield Request('http://www.metacritic.com/browse/games/title/ps4/{0}?page={1}'.format(c, i), callback=self.parseps4)

    #gets the developer and genre of a game
    def parseGame(self, response):
        print("Here")
        item = response.meta['item']
        db1 = MySQLdb.connect("localhost", "root", "andy", "metacritic")
        cursor = db1.cursor()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="product_wrap"]')
        items = []
        item['dev'] = site.xpath('.//span[contains(@class, "summary_detail developer")]/span[1]/text()').extract()
        item['genre'] = site.xpath('.//span[contains(@class, "summary_detail product_genre")]/span[1]/text()').extract()
        cursor.execute("INSERT INTO ps4 (dev, genre) VALUES (%s,%s)", [item['dev'][0], item['genre'][0]])
        items.append(item)
        print item['dev']
        print item['genre']

    def parseps4(self, response):
        #some local variables
        db1 = MySQLdb.connect("localhost", "root", "andy", "metacritic")
        cursor = db1.cursor()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="product_wrap"]')
        items = []

        #iterates through each site
        for site in sites:
            with db1:
                item = MetacriticItem()
                #sets the item
                item['title'] = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/text()').extract()
                item['cscore'] = site.xpath('.//div[contains(@class, "basic_stat product_score brief_metascore")]/div[1]/text()').extract()
                item['uscore'] = site.xpath('.//div/ul/li/span[contains(@class, "data textscore")]/text()').extract()
                item['release'] = site.xpath('.//li[contains(@class, "stat release_date full_release_date")]/span[2]/text()').extract()
                #some processing to check if there is a score attached, if there is, it adds it to the database
                if ("tbd" in item['cscore'][0] and "tbd" not in item['uscore'][0]) or ("tbd" not in item['cscore'][0] and "tbd" in item['uscore'][0]) or ("tbd" not in item['cscore'][0] and "tbd" not in item['uscore'][0]):
                    cursor.execute("INSERT INTO ps4 (title, criticalscore, userscore, releasedate) VALUES (%s,%s,%s, %s)", [(' '.join(item['title'][0].split())).replace("(PS4)","",1), item['cscore'][0], item['uscore'][0], item['release'][0]])
                items.append(item)
                itemLink = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/@href').extract()
                req = Request('http://www.metacritic.com' + itemLink[0], callback=self.parseGame)
                req.meta['item'] = item
Several problems in the code:
meta argument should contain a dictionary {'item': item}
HtmlXPathSelector is deprecated - use Selector instead
I think you shouldn't do MySQL inserts inside the spider - use a database pipeline instead (a minimal sketch is included after the code below):
Writing items to a MySQL database in Scrapy
you need to get the first item of the extract() call and strip() it (this way the Fields hold plain strings, not lists, without leading and trailing whitespace and newlines)
Here's the code without the MySQL-related calls:
from string import lowercase

from scrapy.item import Field, Item
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector, Selector

from metacritic.items import MetacriticItem


class MetacriticSpider(BaseSpider):
    name = 'metacritic'
    allowed_domains = ['metacritic.com']
    max_id = 1  # your max_id value goes here!!!

    def start_requests(self):
        for c in lowercase:
            for i in range(self.max_id):
                yield Request('http://www.metacritic.com/browse/games/title/ps4/{0}?page={1}'.format(c, i), callback=self.parseps4)

    def parseGame(self, response):
        item = response.meta['item']

        hxs = HtmlXPathSelector(response)
        site = hxs.select('//div[@class="product_wrap"]')
        # get additional data!!!

        yield item

    def parseps4(self, response):
        hxs = Selector(response)
        sites = hxs.select('//div[@class="product_wrap"]')
        for site in sites:
            item = MetacriticItem()

            item['title'] = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/text()').extract()[0].strip()
            item['cscore'] = site.xpath('.//div[contains(@class, "basic_stat product_score brief_metascore")]/div[1]/text()').extract()[0].strip()
            item['uscore'] = site.xpath('.//div/ul/li/span[contains(@class, "data textscore")]/text()').extract()[0].strip()
            item['release'] = site.xpath('.//li[contains(@class, "stat release_date full_release_date")]/span[2]/text()').extract()[0].strip()

            link = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/@href').extract()[0]
            yield Request('http://www.metacritic.com/' + link, meta={'item': item}, callback=self.parseGame)
It works for me - I see the yielded items from parseGame() on a console.
Make sure it yields items first, then see the !!! comments - fill these lines accordingly.
After that, if you see items on the console, try creating a database pipeline to write the items to MySQL.
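For illustration, a minimal sketch of such a pipeline, assuming the same local "metacritic" database and "ps4" table used in the question; the pipeline class name and the exact column list are placeholders to adapt:

# pipelines.py
import MySQLdb


class MySQLStorePipeline(object):
    # Writes each scraped item to the ps4 table (a sketch, not production code).

    def open_spider(self, spider):
        self.db = MySQLdb.connect("localhost", "root", "andy", "metacritic")
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        self.cursor.execute(
            "INSERT INTO ps4 (title, criticalscore, userscore, releasedate, dev, genre) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            [item.get('title'), item.get('cscore'), item.get('uscore'),
             item.get('release'), item.get('dev'), item.get('genre')])
        self.db.commit()
        return item

Enable it in settings.py with ITEM_PIPELINES = {'metacritic.pipelines.MySQLStorePipeline': 300} and drop the MySQLdb calls from the spider callbacks.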