Scrapy only gets the data of the last page - Python

I'm using Python 3.6 and Scrapy 2.4.1. I wrote a spider to scrape about 5 pages and then use xlsxwriter to save the results to Excel, but the spider only ends up with the last page's data and I can't figure out why. Here is my spider code:
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem


class EbaySpiderSpider(scrapy.Spider):
    name = 'ebay_spider'
    allowed_domains = ['www.ebay.com.au']
    start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']

    def parse(self, response):
        item_price_extract = []
        item_title = []
        item_title_list = response.xpath('//h3[@class="lvtitle"]/a')
        item_href = response.xpath('//h3[@class="lvtitle"]/a/@href').getall()
        for title in item_title_list:
            item_title_text = title.xpath('string(.)').get()
            item_title.append(item_title_text)
        item_price = response.xpath('//li[@class="lvprice prc"]//span[@class="bold"]')
        for i in range(len(item_price)):
            item_price_text = item_price[i].xpath('string(.)').get()
            item_price_extract.append(item_price_text.strip())
        item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
        yield item_info
        next_url_href = response.xpath('//a[@class="gspr next"]/@href').get()
        if not next_url_href:
            return
        else:
            yield scrapy.Request(next_url_href, callback=self.parse)
and the pipeline code:
import xlsxwriter


class EbayPipeline:
    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        col_num = 0
        workbook = xlsxwriter.Workbook(r'C:\Users\Clevo\Desktop\store_spider.xlsx')
        worksheet = workbook.add_worksheet()
        item_source = dict(item)
        # print(item_source)
        for key, values in item_source.items():
            worksheet.write(0, col_num, key)
            worksheet.write_column(1, col_num, values)
            col_num += 1
        workbook.close()
        return item
Does someone know the reason why? Everything looks fine to me, but I only get the last page's data.
By the way, is there any way to transfer data to another function? I want to scrape the product detail pages, pass that data into the process_item function, and yield everything together.

Better to scrape every listing page first and then get the data from each product page.
class EbaySpiderSpider(scrapy.Spider):
    name = "ebay_spider"

    def start_requests(self):
        base_url = 'https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs='
        for i in range(1, 6):
            page = base_url + str(i)  # i will be the page number appended to base_url
            yield scrapy.Request(url=page, callback=self.parse)

    # scrape all product links first and yield them to parse_contents
    def parse(self, response):
        links = response.xpath('//h3[@class="lvtitle"]/a/@href').extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_contents)

    # scrape the desired data on the product page
    def parse_contents(self, response):
        product_url = response.url
        title = response.xpath('//h1/text()').extract()[0]
        price = response.xpath('//span[@itemprop="price"]/text()').extract()[0]
        item = EbayItem()
        item['product_title'] = title
        item['product_price'] = price
        yield item  # goes to items.py / pipelines
items.py — make sure the item keys match the declared scrapy.Field() names:
class EbayItem(scrapy.Item):
    product_title = scrapy.Field()
    product_price = scrapy.Field()
pipelines.py
import xlsxwriter


class EbayPipeline:
    def process_item(self, item, spider):
        title = item['product_title']
        price = item['product_price']
        # process your worksheet here
        return item
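If you want to keep writing to Excel, here is a minimal sketch of what that worksheet handling could look like (the file path and column layout are just placeholders): create the workbook once in open_spider and close it in close_spider, so each item appends a row instead of recreating and overwriting the file on every process_item call, which is what the original pipeline does.
import xlsxwriter


class EbayPipeline:
    def open_spider(self, spider):
        # create the workbook and header row once, when the spider starts
        self.workbook = xlsxwriter.Workbook(r'C:\Users\Clevo\Desktop\store_spider.xlsx')
        self.worksheet = self.workbook.add_worksheet()
        self.worksheet.write_row(0, 0, ['product_title', 'product_price'])
        self.row = 1

    def process_item(self, item, spider):
        # append one spreadsheet row per scraped item
        self.worksheet.write(self.row, 0, item['product_title'])
        self.worksheet.write(self.row, 1, item['product_price'])
        self.row += 1
        return item

    def close_spider(self, spider):
        # close (and actually save) the workbook once, after the last item
        self.workbook.close()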

Working version of your code
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem


class EbaySpiderSpider(scrapy.Spider):
    name = 'ebay_spider'
    allowed_domains = ['ebay.com.au']
    start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']

    def parse(self, response):
        item_price_extract = []
        item_title = []
        item_title_list = response.xpath('//h3[@class="lvtitle"]/a')
        item_href = response.xpath('//h3[@class="lvtitle"]/a/@href').getall()
        for title in item_title_list:
            item_title_text = title.xpath('string(.)').get()
            item_title.append(item_title_text)
        item_price = response.xpath('//li[@class="lvprice prc"]//span[@class="bold"]')
        for i in range(len(item_price)):
            item_price_text = item_price[i].xpath('string(.)').get()
            item_price_extract.append(item_price_text.strip())
        item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
        yield item_info
        next_url_href = response.xpath('//a[@class="gspr next"]/@href').get()
        if next_url_href is not None:
            next_url_href = response.urljoin(next_url_href)
            yield scrapy.Request(next_url_href, callback=self.parse)
You will have to set ROBOTSTXT_OBEY=False in settings.py (which is not good practice), or else your spider won't scrape any data and will log this message:
[scrapy.downloadermiddlewares.robotstxt] DEBUG: Forbidden by robots.txt: <GET https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1>
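For example, in your project's settings.py:
# settings.py
ROBOTSTXT_OBEY = False  # as noted above, ignoring robots.txt is not good practice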

Related

Scrapy : unable to scrape nested links properly - all the records are duplicating

I'm scraping a portal using Scrapy, and I need to get detailed metadata about every product. To get this information, I initially scrape all the products on a given page and loop through each item to get its metadata. But all the details end up duplicated in the CSV file.
Here is the sample code:
class MyDinSpider(scrapy.Spider):
    name = 'abc'
    allowed_domains = ['www.abc.com.my']
    start_urls = ['http://https://www.abc/online-store/']

    def start_requests(self):
        with open("./csvFiles/urls.csv", "rU") as f:
            reader = csv.DictReader(f)
            for row in reader:
                url = row['url']
                link_urls = [url.format(i) for i in range(2)]
                for link_url in link_urls:
                    print(link_url)
                    request = Request(link_url, callback=self.parse_product_pages, meta={'Category': row['Category']})
                    yield request

    def parse_product_pages(self, response):
        item = MydinscrapingItem()
        content = response.css('.variants-pop-over').extract()
        for product_content in content:
            val = product_content.split('default value=')[1].split('>')[0]
            link = "https://abc/products/detail/?pid={}".format(val.split('"ProductExtId": "')[1].split('"')[0])
            item['Date'] = datetime.today().date()
            item['Title'] = val.split('"ProductName": "')[1].split('"')[0]
            item['Price'] = val.split('"price":"')[1].split('"')[0]
            yield Request(url=link, callback=self.parse_page2, meta={'item': item})

    def parse_page2(self, response):
        item = response.meta['item']
        item['Category'] = category[:-2]
        yield (item)

    def parse(self, response):
        pass
Here is the output csv file

How to scrape link within site using scrapy

I'm trying to use Scrapy to scrape a site, and also a link within the content of the site. However, when I do this I get an error on the line above the yield statement in parse:
TypeError: 'NoneType' object does not support item assignment
Here is my code:
class PostsSpider(scrapy.Spider):
    name = "posts"
    start_urls = ['https://www.nba.com/teams/bucks']
    allowed_domains = ['nba.com']

    def parse(self, response):
        for post in response.css('.nba-player-index section section'):
            playerPage = response.urljoin(post.css('a').attrib['href'])
            item = yield scrapy.Request(playerPage, callback=self.helper)
            item['number'] = post.css('span.nba-player-trending-item__number::text').get(),
            yield item

    def helper(self, response):
        print("--->" + response.css("title").get())
        item = Item()
        item['title'] = response.css("title::text").get()
        yield item


class Item(scrapy.Item):
    # define the fields for your item here like:
    number = scrapy.Field()
    title = scrapy.Field()
    ppg = scrapy.Field()
What you can do is pass the number data along to helper via the request instead of doing it this way.
Something like this:
def parse(self, response):
    for post in response.css('.nba-player-index section section'):
        playerPage = response.urljoin(post.css('a').attrib['href'])
        meta = response.meta.copy()
        meta['number'] = post.css('span.nba-player-trending-item__number::text').get()
        yield scrapy.Request(playerPage, callback=self.helper, meta=meta)

def helper(self, response):
    # here you will get `number` in response.meta['number'] and can yield it further
    item = Item()
    item['number'] = response.meta.get('number')
    yield item

Scrapy merge subsite-item with site-item

I'm trying to scrape details from a subsite and merge them with the details scraped from the main site. I've been researching through Stack Overflow as well as the documentation, but I still can't get my code to work. It seems that my function to extract additional details from the subsite does not work. If anyone could take a look I would be very grateful.
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get infoID and Type from database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, category FROM dbo.StageItem")
        rows = self.cursor.fetchall()
        for row in rows:
            url = 'http://www.nevermind.com/info/'
            InfoID = row[0]
            category = row[1]
            yield self.make_requests_from_url(url + InfoID, InfoID, category, self.parse)

    def make_requests_from_url(self, url, InfoID, category, callback):
        request = Request(url, callback)
        request.meta['InfoID'] = InfoID
        request.meta['category'] = category
        return request

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path
        itemPool = []
        InfoID = response.meta['InfoID']
        category = response.meta['category']
        for info in infodata:
            item = infoItem()
            item_cur, item_hist = InfoItemSubSite()
            # Stem Details
            item['id'] = InfoID
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
            item_cur['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item_cur['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()

            # Extract additional information about item_cur from the referring site
            # This part does not work
            if item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract():
                url = 'http://www.nevermind.com/info/sub/' + item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()[0]
                request = Request(url, housingtype, self.parse_item_sub)
                request.meta['category'] = category
                yield self.parse_item_sub(url, category)

            item_his['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_his['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            item_his['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract()

            item['subsite_dic'] = [dict(item_cur), dict(item_his)]
            itemPool.append(item)
            yield item
        pass

    # Function to extract additional info from the subsite, and return it to the original item.
    def parse_item_sub(self, response, category):
        hxs = Selector(response)
        subsite = hxs.xpath('div/div[2]')  # input base path
        category = response.meta['category']
        for i in subsite:
            item = InfoItemSubSite()
            if (category == 'first'):
                item['subsite_field1'] = i.xpath('/td[2]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[2]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[5]/a[1]/@href').extract()
            else:
                item['subsite_field1'] = i.xpath('/tr[10]/td[3]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[1]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[7]/a[1]/@href').extract()
            return item
        pass
I've been looking at these examples together with a lot of other examples (Stack Overflow is great for that!), as well as the Scrapy documentation, but I'm still unable to understand how to get details sent from one function and merged with the items scraped in the original function:
how do I merge results from the target page into the current page in Scrapy?
How can I use multiple requests and pass items between them in Scrapy (Python)?
What you are looking at here is called request chaining. Your problem is yielding one item built from several requests. The solution is to chain the requests while carrying your item in the request's meta attribute.
Example:
def parse(self, response):
    item = MyItem()
    item['name'] = response.xpath("//div[@id='name']/text()").extract()
    more_page = ...  # some page that offers more details
    # go to the more-details page and take your item with you
    yield Request(more_page,
                  self.parse_more,
                  meta={'item': item})

def parse_more(self, response):
    # get your item from the meta
    item = response.meta['item']
    # fill it in with more data and yield!
    item['last_name'] = response.xpath("//div[@id='lastname']/text()").extract()
    yield item
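Scrapy 1.7 and later also support passing data to the callback with cb_kwargs instead of meta, which keeps meta free for middleware data. A rough sketch of the same chain using that approach (same hypothetical MyItem and XPaths as above):
def parse(self, response):
    item = MyItem()
    item['name'] = response.xpath("//div[@id='name']/text()").extract()
    more_page = ...  # some page that offers more details
    # pass the item as a keyword argument to the next callback
    yield Request(more_page, self.parse_more, cb_kwargs={'item': item})

def parse_more(self, response, item):
    # the item arrives as a regular callback argument
    item['last_name'] = response.xpath("//div[@id='lastname']/text()").extract()
    yield item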

Scrapy: Unsuccessful iterating over a list and pagination

My goal is to extract all 25 rows (6 items per row) per page, then iterate over each of the 40 pages.
Currently, my spider extracts the first row from pages 1-3 (see CSV output image).
I assumed the list_iterator() function would iterate over each row; however, there appears to be an error in either my rules or the list_iterator() function that is preventing all rows per page from being scraped.
Any assistance or advice is greatly appreciated!
propub_spider.py:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from propub.items import PropubItem
from scrapy.http import Request


class propubSpider(CrawlSpider):
    name = 'prop$'
    allowed_domains = ['https://projects.propublica.org']
    max_pages = 40
    start_urls = [
        'https://projects.propublica.org/docdollars/search?state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=2&state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=3&state%5Bid%5D=33']

    rules = (Rule(SgmlLinkExtractor(allow=('\\search?page=\\d')), 'parse_start_url', follow=True),)

    def list_iterator(self):
        for i in range(self.max_pages):
            yield Request('https://projects.propublica.org/docdollars/search?page=d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]/tbody'):
            item = PropubItem()
            item['payee'] = sel.xpath('tr[1]/td[1]/a[2]/text()').extract()
            item['link'] = sel.xpath('tr[1]/td[1]/a[1]/@href').extract()
            item['city'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['state'] = sel.xpath('tr[1]/td[3]/text()').extract()
            item['company'] = sel.xpath('tr[1]/td[4]').extract()
            item['amount'] = sel.xpath('tr[1]/td[7]/span/text()').extract()
            yield item
pipelines.py:
import csv


class PropubPipeline(object):
    def __init__(self):
        self.myCSV = csv.writer(open('C:\Users\Desktop\propub.csv', 'wb'))
        self.myCSV.writerow(['payee', 'link', 'city', 'state', 'company', 'amount'])

    def process_item(self, item, spider):
        self.myCSV.writerow([item['payee'][0].encode('utf-8'),
                             item['link'][0].encode('utf-8'),
                             item['city'][0].encode('utf-8'),
                             item['state'][0].encode('utf-8'),
                             item['company'][0].encode('utf-8'),
                             item['amount'][0].encode('utf-8')])
        return item
items.py:
import scrapy
from scrapy.item import Item, Field
class PropubItem(scrapy.Item):
payee = scrapy.Field()
link = scrapy.Field()
city = scrapy.Field()
state = scrapy.Field()
company = scrapy.Field()
amount = scrapy.Field()
pass
CSV output:
Multiple things need to be fixed:
- use the start_requests() method instead of list_iterator()
- there is a missing % here:
  yield Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)
                                                                        # HERE^
- you don't need CrawlSpider since you are providing the pagination links via start_requests() - use a regular scrapy.Spider
- it would be more reliable if the XPath expressions matched the cells by class attributes
Fixed version:
import scrapy
from propub.items import PropubItem


class propubSpider(scrapy.Spider):
    name = 'prop$'
    allowed_domains = ['projects.propublica.org']
    max_pages = 40

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]//tr[@data-payment-id]'):
            item = PropubItem()
            item['payee'] = sel.xpath('td[@class="name_and_payee"]/a[last()]/text()').extract()
            item['link'] = sel.xpath('td[@class="name_and_payee"]/a[1]/@href').extract()
            item['city'] = sel.xpath('td[@class="city"]/text()').extract()
            item['state'] = sel.xpath('td[@class="state"]/text()').extract()
            item['company'] = sel.xpath('td[@class="company"]/text()').extract()
            item['amount'] = sel.xpath('td[@class="amount"]/text()').extract()
            yield item

Adding Scrapy request URL into Parsed Array

I'm using the Scrapy code below, which is fully functioning, to scrape data from a website. The scraper reads a text list of product IDs, which are combined into URLs in the start_urls list. How can I add the current start_url as an additional element to my item array?
from scrapy.spider import Spider
from scrapy.selector import Selector
from site_scraper.items import SiteScraperItem


class MySpider(Spider):
    name = "product"
    allowed_domains = ["site.com"]
    url_list = open("productIDs.txt")
    base_url = "http://www.site.com/p/"
    start_urls = [base_url + url.strip() for url in url_list.readlines()]
    url_list.close()

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//span[@itemprop='name']")
        items = []
        item = SiteScraperItem()
        item["Classification"] = titles.xpath("//div[@class='productSoldMessage']/text()").extract()[1:]
        item["Price"] = titles.xpath("//span[@class='pReg']/text()").extract()
        item["Name"] = titles.xpath("//span[@itemprop='name']/text()").extract()
        try:
            titles.xpath("//link[@itemprop='availability']/@href").extract()[0] == 'http://schema.org/InStock'
            item["Availability"] = 'In Stock'
        except:
            item["Availability"] = 'Out of Stock'
        if len(item["Name"]) == 0:
            item["OnlineStatus"] = 'Offline'
            item["Availability"] = ''
        else:
            item["OnlineStatus"] = 'Online'
        items.append(item)
        return items
I am exporting this data to CSV using the below command line code and would like the URL to be an additional value in my CSV file.
scrapy crawl product -o items.csv -t csv
Thanks in advance for your help!
Add a new Field to your SiteScraperItem Item class and set it to response.url in the parse() method.
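A minimal sketch of that, assuming a new field named URL (a hypothetical name; the other field names mirror the keys used in parse() above):
import scrapy


class SiteScraperItem(scrapy.Item):
    # existing fields, mirroring the keys used in parse()
    Classification = scrapy.Field()
    Price = scrapy.Field()
    Name = scrapy.Field()
    Availability = scrapy.Field()
    OnlineStatus = scrapy.Field()
    # new (hypothetical) field to hold the request URL
    URL = scrapy.Field()
and in parse(), before items.append(item):
item["URL"] = response.url
The extra field then shows up as another column in the CSV produced by scrapy crawl product -o items.csv -t csv.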
