import scrapy
from mystocks.items import MystocksItem
from scrapy.selector import Selector
import datetime


class Synergyspider(scrapy.Spider):
    name = "synergyspider"
    allowed_domains = ["http://live.mystocks.co.ke/price_list/"]

    def parse(self, response):
        sel = Selector(response)
        head = sel.xpath('//*[@id="main"]/h2')
        rows_r1 = sel.xpath('//tr[@class = "row r1"]')
        items = []
        for row in rows_r1:
            item = MystocksItem()
            item['date'] = head.xpath('text()').extract()[0]
            item['code'] = rows_r1.xpath('./td[1]/a/text()').extract()[0]
            item['name'] = rows_r1.xpath('./td[2]/text()').extract()[0]
            item['last12_low'] = rows_r1.xpath('./td[3]/text()').extract()[0]
            item['last12_high'] = rows_r1.xpath('./td[4]/text()').extract()[0]
            #item['day_low'] = rows_r1.xpath('./td[5]/text()').extract()[0]
            item['day_high'] = rows_r1.xpath('./td[6]/text()').extract()[0]
            item['price'] = rows_r1.xpath('./td[7]/text()').extract()[0]
            item['previous'] = rows_r1.xpath('./td[8]/text()').extract()[0]
            item['change'] = rows_r1.xpath('./td[9]/text()').extract()[0]
            item['percentChange'] = rows_r1.xpath('./td[10]/text()').extract()[0]
            item['volume'] = rows_r1.xpath('./td[12]/text()').extract()[0]
            item['adjustedPrice'] = rows_r1.xpath('./td[13]/text()').extract()[0]
            items.append(item)
        return items
The URL I was trying to parse is http://live.mystocks.co.ke/price_list/20140402.
I want to iterate over the rows with classes "row r1" and "row r0" but skip the rows with class "row".
I found out I was looping over the wrong variable: extract the cells from row, not rows_r1.
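For reference, a minimal sketch of the corrected loop (assuming the same MystocksItem fields as above): it selects both "row r1" and "row r0" in one XPath, which also skips the plain "row" rows, and reads each cell from row rather than rows_r1.

import scrapy
from mystocks.items import MystocksItem


class Synergyspider(scrapy.Spider):
    name = "synergyspider"
    start_urls = ["http://live.mystocks.co.ke/price_list/20140402"]

    def parse(self, response):
        head = response.xpath('//*[@id="main"]/h2')
        # Match both "row r1" and "row r0"; plain "row" rows are skipped.
        rows = response.xpath('//tr[@class="row r1" or @class="row r0"]')
        for row in rows:
            item = MystocksItem()
            item['date'] = head.xpath('text()').extract_first()
            item['code'] = row.xpath('./td[1]/a/text()').extract_first()
            item['name'] = row.xpath('./td[2]/text()').extract_first()
            item['price'] = row.xpath('./td[7]/text()').extract_first()
            yield item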
import scrapy
from ..items import AmazondawinItem


class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        items = AmazondawinItem()
        name = response.css('.a-size-medium::text').extract()
        try:
            old_price = response.css('.a-spacing-top-micro .a-text-price span::text').extract()
        except:
            old_price = None
        price = response.css('.a-spacing-top-micro .a-price-whole::text').extract()
        try:
            review = response.css('.s-link-style .s-underline-text::text').extract()
        except:
            review = None
        imagelink = response.css('.s-image::attr(src)').extract()
        items['name'] = name
        items['old_price'] = old_price
        items['price'] = price
        items['review'] = review
        items['imagelink'] = imagelink
        # description =
        # ram =
        # brand =
        # cpu_model =
        yield items
When I export my scraped data to a CSV (or any other) file, all of the data ends up in a single row, no matter how many rows I scraped. Suppose I scraped 200 rows in one column; I get all 200 rows of data crammed into one row.
It's because you're yielding all the items instead of yielding each item separately.
A not so nice solution:
import scrapy
# from ..items import AmazondawinItem


class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        # items = AmazondawinItem()
        name = response.css('.a-size-medium::text').extract()
        try:
            old_price = response.css('.a-spacing-top-micro .a-text-price span::text').extract()
        except:
            old_price = None
        price = response.css('.a-spacing-top-micro .a-price-whole::text').extract()
        try:
            review = response.css('.s-link-style .s-underline-text::text').extract()
        except:
            review = None
        imagelink = response.css('.s-image::attr(src)').extract()
        # items = dict()
        # items['name'] = name
        # items['old_price'] = old_price
        # items['price'] = price
        # items['review'] = review
        # items['imagelink'] = imagelink
        items = dict()
        for (items['name'], items['old_price'], items['price'], items['review'], items['imagelink']) in zip(name, old_price, price, review, imagelink):
            yield items
        # description =
        # ram =
        # brand =
        # cpu_model =
        # yield items
A better solution:
Remove the try/except; the get() method returns None if no value is found. It's better not to use try/except like that in spiders anyway.
Get the items one by one, row by row.
Just replace the dict with your item class, and make sure it's instantiated inside the loop (a sketch follows the code below).
import scrapy
# from ..items import AmazondawinItem


class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        for row in response.css('div.s-result-list div.s-result-item.s-asin'):
            # items = AmazondawinItem()
            items = dict()
            items['name'] = row.css('.a-size-medium::text').get()
            items['old_price'] = row.css('.a-spacing-top-micro .a-text-price span::text').get()
            items['price'] = row.css('.a-spacing-top-micro .a-price-whole::text').get()  # scope to the row, not the whole response
            items['review'] = row.css('.s-link-style .s-underline-text::text').get()
            items['imagelink'] = row.css('.s-image::attr(src)').get()
            yield items
            # description =
            # ram =
            # brand =
            # cpu_model =
            # yield items
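And if you prefer the item class over a plain dict, a minimal sketch of the same loop (it assumes AmazondawinItem declares name, old_price, price, review and imagelink as scrapy.Field()):

import scrapy
from ..items import AmazondawinItem


class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        for row in response.css('div.s-result-list div.s-result-item.s-asin'):
            # one item instance per result row
            item = AmazondawinItem()
            item['name'] = row.css('.a-size-medium::text').get()
            item['old_price'] = row.css('.a-spacing-top-micro .a-text-price span::text').get()
            item['price'] = row.css('.a-spacing-top-micro .a-price-whole::text').get()
            item['review'] = row.css('.s-link-style .s-underline-text::text').get()
            item['imagelink'] = row.css('.s-image::attr(src)').get()
            yield item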
I'm using Python 3.6 and Scrapy 2.4.1, and I wrote a spider to scrape about 5 pages and then save the results to Excel with xlsxwriter. However, the spider only gets the last page's data, and I can't figure out why. Here is my spider code:
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem


class EbaySpiderSpider(scrapy.Spider):
    name = 'ebay_spider'
    allowed_domains = ['www.ebay.com.au']
    start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']

    def parse(self, response):
        item_price_extract = []
        item_title = []
        item_title_list = response.xpath('//h3[@class="lvtitle"]/a')
        item_href = response.xpath('//h3[@class="lvtitle"]/a/@href').getall()
        for title in item_title_list:
            item_title_text = title.xpath('string(.)').get()
            item_title.append(item_title_text)
        item_price = response.xpath('//li[@class="lvprice prc"]//span[@class="bold"]')
        for i in range(len(item_price)):
            item_price_text = item_price[i].xpath('string(.)').get()
            item_price_extract.append(item_price_text.strip())
        item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
        yield item_info
        next_url_href = response.xpath('//a[@class="gspr next"]/@href').get()
        if not next_url_href:
            return
        else:
            yield scrapy.Request(next_url_href, callback=self.parse)
and pipeline code
import xlsxwriter


class EbayPipeline:
    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        col_num = 0
        workbook = xlsxwriter.Workbook(r'C:\Users\Clevo\Desktop\store_spider.xlsx')
        worksheet = workbook.add_worksheet()
        item_source = dict(item)
        # print(item_source)
        for key, values in item_source.items():
            worksheet.write(0, col_num, key)
            worksheet.write_column(1, col_num, values)
            col_num += 1
        workbook.close()
        return item
Does anyone know the reason why? Everything seems fine, but I can only get the last page's data.
By the way, is there any way to transfer data to another function? I want to scrape the page detail, transfer that data to process_item, and yield everything together.
Better to scrape every listing page first and then get the data from each product page.
class EbaySpiderSpider(scrapy.Spider):
    name = "ebay_spider"

    def start_requests(self):
        base_url = 'https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs='
        for i in range(1, 6):
            page = base_url + str(i)  # i is the page number appended to base_url
            yield scrapy.Request(url=page, callback=self.parse)

    # scrape all product links first and yield them to parse_contents
    def parse(self, response):
        links = response.xpath('//h3[@class="lvtitle"]/a/@href').extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_contents)

    # scrape the desired data on the product page
    def parse_contents(self, response):
        product_url = response.url
        title = response.xpath('//h1/text()').extract()[0]
        price = response.xpath('//span[@itemprop="price"]/text()').extract()[0]
        item = EbayItem()
        item['product_title'] = title
        item['product_price'] = price
        yield item  # goes on to items.py / the pipeline
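On the follow-up about transferring data to another function: one common approach (a sketch, not part of the original answer) is to attach values from the listing page to the request's meta, then read them back in the product-page callback. Here the listing_url key is just an illustrative name:

    def parse(self, response):
        links = response.xpath('//h3[@class="lvtitle"]/a/@href').extract()
        for link in links:
            # carry data from this listing page along with the request
            yield scrapy.Request(url=link, callback=self.parse_contents,
                                 meta={'listing_url': response.url})

    def parse_contents(self, response):
        # read back whatever the listing page attached to the request
        listing_url = response.meta['listing_url']
        item = EbayItem()
        item['product_title'] = response.xpath('//h1/text()').get()
        item['product_price'] = response.xpath('//span[@itemprop="price"]/text()').get()
        yield item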
In items.py, make sure the field names match the keys the spider sets, each declared as scrapy.Field():

class EbayItem(scrapy.Item):
    product_title = scrapy.Field()
    product_price = scrapy.Field()
pipelines.py:

import xlsxwriter


class EbayPipeline:
    def process_item(self, item, spider):
        title = item['product_title']
        price = item['product_price']
        # process your worksheet here
        return item
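For the Excel side specifically: the original pipeline creates and closes the workbook inside process_item, so every item overwrites the file and only the last one survives, which is likely why only the last page's data shows up. A sketch that opens the workbook once per crawl instead (the path and two-column layout are just placeholders):

import xlsxwriter


class EbayPipeline:
    def open_spider(self, spider):
        # open the workbook once for the whole crawl
        self.workbook = xlsxwriter.Workbook(r'C:\Users\Clevo\Desktop\store_spider.xlsx')
        self.worksheet = self.workbook.add_worksheet()
        self.worksheet.write_row(0, 0, ['product_title', 'product_price'])
        self.row = 1

    def process_item(self, item, spider):
        # one spreadsheet row per scraped item
        self.worksheet.write_row(self.row, 0, [item['product_title'], item['product_price']])
        self.row += 1
        return item

    def close_spider(self, spider):
        # closing the workbook is what actually writes the file
        self.workbook.close()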
Working version of your code
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem


class EbaySpiderSpider(scrapy.Spider):
    name = 'ebay_spider'
    allowed_domains = ['ebay.com.au']
    start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']

    def parse(self, response):
        item_price_extract = []
        item_title = []
        item_title_list = response.xpath('//h3[@class="lvtitle"]/a')
        item_href = response.xpath('//h3[@class="lvtitle"]/a/@href').getall()
        for title in item_title_list:
            item_title_text = title.xpath('string(.)').get()
            item_title.append(item_title_text)
        item_price = response.xpath('//li[@class="lvprice prc"]//span[@class="bold"]')
        for i in range(len(item_price)):
            item_price_text = item_price[i].xpath('string(.)').get()
            item_price_extract.append(item_price_text.strip())
        item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
        yield item_info
        next_url_href = response.xpath('//a[@class="gspr next"]/@href').get()
        if next_url_href is not None:
            next_url_href = response.urljoin(next_url_href)
            yield scrapy.Request(next_url_href, callback=self.parse)
You will have to set ROBOTSTXT_OBEY = False in settings.py (which is not good practice), or else your spider won't scrape any data and will log:
[scrapy.downloadermiddlewares.robotstxt] DEBUG: Forbidden by robots.txt: <GET https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1>
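For reference, that is a single line in the project's settings.py (shown as a sketch; whether disabling robots.txt is appropriate depends on the site's terms):

# settings.py
ROBOTSTXT_OBEY = False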
I'm creating a CSV file with my spider, but it gives me a weird ordering of the data:
My code:
import scrapy
from ..items import TutorialItem


class GoodmanSpider(scrapy.Spider):
    name = "goodmans"
    start_urls = ['http://www.goodmans.net/d/1706/brands.htm']

    def parse(self, response):
        items = TutorialItem()
        all_data = response.css('.SubDepartments')
        for data in all_data:
            category = data.css('.SubDepartments a::text').extract()
            category_url = data.css('.SubDepartments a::attr(href)').extract()
            items['category'] = category
            items['category_url'] = category_url
            yield items
My items.py file
The output I get:
The output I want, more or less:
You have stacked all your items into a single one.
Each item should be a dict with a single value per key, whereas you are assigning whole lists.
Try something like:
for cat, url in zip(category, category_url):
    item = dict(category=cat, category_url=url)
    yield item
This is the corrected code, based on Michael's answer. It works perfectly.
import scrapy
from ..items import TutorialItem
import pandas as pd


class GoodmanSpider(scrapy.Spider):
    name = "goodmans"
    start_urls = ['http://www.goodmans.net/d/1706/brands.htm']

    def parse(self, response):
        items = TutorialItem()
        all_data = response.css('.SubDepartments')
        for data in all_data:
            category = data.css('.SubDepartments a::text').extract()
            category_url = data.css('.SubDepartments a::attr(href)').extract()
            items['category'] = category
            items['category_url'] = category_url
            for cat, url in zip(category, category_url):
                item = dict(category=cat, category_url=url)
                yield item
I'm trying to scrape details from a subsite and merge them with the details scraped from the main site. I've been researching on Stack Overflow as well as in the documentation, but I still can't get my code to work. It seems that my function to extract additional details from the subsite does not work. If anyone could take a look I would be very grateful.
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get InfoID and category from the database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, category FROM dbo.StageItem")
        rows = self.cursor.fetchall()
        for row in rows:
            url = 'http://www.nevermind.com/info/'
            InfoID = row[0]
            category = row[1]
            yield self.make_requests_from_url(url+InfoID, InfoID, category, self.parse)

    def make_requests_from_url(self, url, InfoID, category, callback):
        request = Request(url, callback)
        request.meta['InfoID'] = InfoID
        request.meta['category'] = category
        return request

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path
        itemPool = []
        InfoID = response.meta['InfoID']
        category = response.meta['category']
        for info in infodata:
            item = infoItem()
            item_cur, item_hist = InfoItemSubSite()
            # Stem details
            item['id'] = InfoID
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
            item_cur['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item_cur['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()

            # Extract additional information about item_cur from the referring site
            # This part does not work
            if item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract():
                url = 'http://www.nevermind.com/info/sub/' + item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()[0]
                request = Request(url, housingtype, self.parse_item_sub)
                request.meta['category'] = category
                yield self.parse_item_sub(url, category)

            item_his['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_his['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            item_his['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract()

            item['subsite_dic'] = [dict(item_cur), dict(item_his)]
            itemPool.append(item)
            yield item
        pass

    # Function to extract additional info from the subsite, and return it to the original item.
    def parse_item_sub(self, response, category):
        hxs = Selector(response)
        subsite = hxs.xpath('div/div[2]')  # input base path
        category = response.meta['category']
        for i in subsite:
            item = InfoItemSubSite()
            if (category == 'first'):
                item['subsite_field1'] = i.xpath('/td[2]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[2]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[5]/a[1]/@href').extract()
            else:
                item['subsite_field1'] = i.xpath('/tr[10]/td[3]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[1]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[7]/a[1]/@href').extract()
            return item
        pass
I've been looking at these examples together with a lot of other examples (Stack Overflow is great for that!), as well as the Scrapy documentation, but I still can't understand how to get details sent from one function and merged with the items scraped in the original function:
how do i merge results from target page to current page in scrapy?
How can i use multiple requests and pass items in between them in scrapy python
What you are looking at here is called request chaining. Your problem is how to yield one item assembled from several requests. The solution is to chain the requests while carrying your item in the requests' meta attribute.
Example:
def parse(self, response):
    item = MyItem()
    item['name'] = response.xpath("//div[@id='name']/text()").extract()
    more_page = ...  # some page that offers more details
    # go to the more-details page and take your item with you
    yield Request(more_page,
                  self.parse_more,
                  meta={'item': item})

def parse_more(self, response):
    # get your item from the meta
    item = response.meta['item']
    # fill it in with more data and yield!
    item['last_name'] = response.xpath("//div[@id='lastname']/text()").extract()
    yield item
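As a side note (an addition, not part of the original answer): on Scrapy 1.7+ the cb_kwargs argument is generally preferred over meta for this kind of hand-off, since the item arrives as a named callback parameter. A minimal sketch of the same chain:

def parse(self, response):
    item = MyItem()
    item['name'] = response.xpath("//div[@id='name']/text()").extract()
    more_page = ...  # some page that offers more details
    # pass the item as a keyword argument to the next callback
    yield Request(more_page, self.parse_more, cb_kwargs={'item': item})

def parse_more(self, response, item):
    # the item comes in as a regular parameter
    item['last_name'] = response.xpath("//div[@id='lastname']/text()").extract()
    yield item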
My goal is to extract all 25 rows (6 items per row) per page, then iterate over each of the 40 pages.
Currently, my spider extracts only the first row from pages 1-3 (see the CSV output image).
I assumed the list_iterator() function would iterate over each row; however, there appears to be an error in either my rules or the list_iterator() function that is preventing all rows per page from being scraped.
Any assistance or advice is greatly appreciated!
propub_spider.py:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from propub.items import PropubItem
from scrapy.http import Request


class propubSpider(CrawlSpider):
    name = 'prop$'
    allowed_domains = ['https://projects.propublica.org']
    max_pages = 40
    start_urls = [
        'https://projects.propublica.org/docdollars/search?state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=2&state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=3&state%5Bid%5D=33']

    rules = (Rule(SgmlLinkExtractor(allow=('\\search?page=\\d')), 'parse_start_url', follow=True),)

    def list_iterator(self):
        for i in range(self.max_pages):
            yield Request('https://projects.propublica.org/docdollars/search?page=d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]/tbody'):
            item = PropubItem()
            item['payee'] = sel.xpath('tr[1]/td[1]/a[2]/text()').extract()
            item['link'] = sel.xpath('tr[1]/td[1]/a[1]/@href').extract()
            item['city'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['state'] = sel.xpath('tr[1]/td[3]/text()').extract()
            item['company'] = sel.xpath('tr[1]/td[4]').extract()
            item['amount'] = sel.xpath('tr[1]/td[7]/span/text()').extract()
            yield item
pipelines.py:
import csv


class PropubPipeline(object):
    def __init__(self):
        self.myCSV = csv.writer(open('C:\Users\Desktop\propub.csv', 'wb'))
        self.myCSV.writerow(['payee', 'link', 'city', 'state', 'company', 'amount'])

    def process_item(self, item, spider):
        self.myCSV.writerow([item['payee'][0].encode('utf-8'),
                             item['link'][0].encode('utf-8'),
                             item['city'][0].encode('utf-8'),
                             item['state'][0].encode('utf-8'),
                             item['company'][0].encode('utf-8'),
                             item['amount'][0].encode('utf-8')])
        return item
items.py:
import scrapy
from scrapy.item import Item, Field


class PropubItem(scrapy.Item):
    payee = scrapy.Field()
    link = scrapy.Field()
    city = scrapy.Field()
    state = scrapy.Field()
    company = scrapy.Field()
    amount = scrapy.Field()
    pass
CSV output:
Multiple things need to be fixed:
use start_requests() method instead of list_iterator()
there is a missing % here:
yield Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)
# HERE^
you don't need CrawlSpider since you are providing the pagination links via start_requests() - use regular scrapy.Spider
it would be more reliable if the XPath expressions matched the cells by their class attributes
Fixed version:
import scrapy
from propub.items import PropubItem


class propubSpider(scrapy.Spider):
    name = 'prop$'
    allowed_domains = ['projects.propublica.org']
    max_pages = 40

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]//tr[@data-payment-id]'):
            item = PropubItem()
            item['payee'] = sel.xpath('td[@class="name_and_payee"]/a[last()]/text()').extract()
            item['link'] = sel.xpath('td[@class="name_and_payee"]/a[1]/@href').extract()
            item['city'] = sel.xpath('td[@class="city"]/text()').extract()
            item['state'] = sel.xpath('td[@class="state"]/text()').extract()
            item['company'] = sel.xpath('td[@class="company"]/text()').extract()
            item['amount'] = sel.xpath('td[@class="amount"]/text()').extract()
            yield item