Adding Scrapy request URL into Parsed Array - python

I'm using the Scrapy code below, which is fully functional, to scrape data from a website. The spider reads a text file of product IDs, each of which is appended to a base URL to build the start URLs. How can I add the current start URL as an additional element on my item?
from scrapy.spider import Spider
from scrapy.selector import Selector
from site_scraper.items import SiteScraperItem

class MySpider(Spider):
    name = "product"
    allowed_domains = ["site.com"]
    url_list = open("productIDs.txt")
    base_url = "http://www.site.com/p/"
    start_urls = [base_url + url.strip() for url in url_list.readlines()]
    url_list.close()

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//span[@itemprop='name']")
        items = []
        item = SiteScraperItem()
        item["Classification"] = titles.xpath("//div[@class='productSoldMessage']/text()").extract()[1:]
        item["Price"] = titles.xpath("//span[@class='pReg']/text()").extract()
        item["Name"] = titles.xpath("//span[@itemprop='name']/text()").extract()
        try:
            in_stock = titles.xpath("//link[@itemprop='availability']/@href").extract()[0] == 'http://schema.org/InStock'
            item["Availability"] = 'In Stock' if in_stock else 'Out of Stock'
        except IndexError:
            item["Availability"] = 'Out of Stock'
        if len(item["Name"]) == 0:
            item["OnlineStatus"] = 'Offline'
            item["Availability"] = ''
        else:
            item["OnlineStatus"] = 'Online'
        items.append(item)
        return items
I am exporting this data to CSV using the command below and would like the URL to be an additional value in my CSV file.
scrapy crawl product -o items.csv -t csv
Thanks in advance for your help!

Add a new Field to your SiteScraperItem Item class and set it to response.url in the parse() method.
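A minimal sketch of that change (the field name URL is an assumption; use whatever column name you want in the CSV):

from scrapy.item import Item, Field

class SiteScraperItem(Item):
    Classification = Field()
    Price = Field()
    Name = Field()
    Availability = Field()
    OnlineStatus = Field()
    URL = Field()  # hypothetical new field for the request URL

Then populate it inside parse() before appending the item:

        item["URL"] = response.url

The CSV feed exporter picks the new field up automatically, so the same scrapy crawl product -o items.csv -t csv command gains a URL column.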

Related

Scrapy only gets the data of the last page

I'm using Python 3.6 and Scrapy 2.4.1, and I wrote a spider to scrape about 5 pages and then save the data to Excel with xlsxwriter. However, the spider only gets the last page's data, and I can't figure out why. Here is my spider code:
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem

class EbaySpiderSpider(scrapy.Spider):
    name = 'ebay_spider'
    allowed_domains = ['www.ebay.com.au']
    start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']

    def parse(self, response):
        item_price_extract = []
        item_title = []
        item_title_list = response.xpath('//h3[@class="lvtitle"]/a')
        item_href = response.xpath('//h3[@class="lvtitle"]/a/@href').getall()
        for title in item_title_list:
            item_title_text = title.xpath('string(.)').get()
            item_title.append(item_title_text)
        item_price = response.xpath('//li[@class="lvprice prc"]//span[@class="bold"]')
        for i in range(len(item_price)):
            item_price_text = item_price[i].xpath('string(.)').get()
            item_price_extract.append(item_price_text.strip())
        item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
        yield item_info
        next_url_href = response.xpath('//a[@class="gspr next"]/@href').get()
        if not next_url_href:
            return
        else:
            yield scrapy.Request(next_url_href, callback=self.parse)
and the pipeline code:
import xlsxwriter

class EbayPipeline:
    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        col_num = 0
        workbook = xlsxwriter.Workbook(r'C:\Users\Clevo\Desktop\store_spider.xlsx')
        worksheet = workbook.add_worksheet()
        item_source = dict(item)
        # print(item_source)
        for key, values in item_source.items():
            worksheet.write(0, col_num, key)
            worksheet.write_column(1, col_num, values)
            col_num += 1
        workbook.close()
        return item
Does anyone know the reason why? Everything seems fine, but I can only get the last page's data.
By the way, is there any way to pass data to another function? I want to scrape each product's detail page, pass that data to process_item, and yield everything together.
Better to scrape every listing page first and then get the data from each product page.
import scrapy
from ebay.items import EbayItem

class EbaySpiderSpider(scrapy.Spider):
    name = "ebay_spider"

    def start_requests(self):
        base_url = 'https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs='
        for i in range(1, 6):
            page = base_url + str(i)  # i is the page number appended to base_url
            yield scrapy.Request(url=page, callback=self.parse)

    # scrape all product links first and yield them to parse_contents
    def parse(self, response):
        links = response.xpath('//h3[@class="lvtitle"]/a/@href').extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_contents)

    # scrape the desired data on the product page
    def parse_contents(self, response):
        product_url = response.url
        title = response.xpath('//h1/text()').extract()[0]
        price = response.xpath('//span[@itemprop="price"]/text()').extract()[0]
        item = EbayItem()
        item['product_title'] = title
        item['product_price'] = price
        yield item  # to items.py
In items.py, make sure that the item keys match the scrapy.Field() definitions:
import scrapy

class EbayItem(scrapy.Item):
    product_title = scrapy.Field()
    product_price = scrapy.Field()
pipelines.py
import xlsxwriter

class EbayPipeline:
    def process_item(self, item, spider):
        title = item['product_title']
        price = item['product_price']
        # process your worksheet here
        return item  # pipelines should return the item
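Note that the original pipeline recreates the workbook inside process_item, so each item overwrites the file and only the last one survives, which by itself explains the "last page only" symptom. A sketch of one way around it, assuming the same xlsxwriter library and one row per item (file path and field names are illustrative):

import xlsxwriter

class EbayPipeline:
    def open_spider(self, spider):
        # create the workbook once, when the spider starts
        self.workbook = xlsxwriter.Workbook('store_spider.xlsx')
        self.worksheet = self.workbook.add_worksheet()
        self.worksheet.write_row(0, 0, ['product_title', 'product_price'])
        self.row = 1

    def process_item(self, item, spider):
        # append one row per scraped item
        self.worksheet.write_row(self.row, 0, [item['product_title'], item['product_price']])
        self.row += 1
        return item

    def close_spider(self, spider):
        # close once, after all items have been written
        self.workbook.close()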
A working version of your code:
import scrapy
from scrapy.selector import Selector
from ebay.items import EbayItem

class EbaySpiderSpider(scrapy.Spider):
    name = 'ebay_spider'
    allowed_domains = ['ebay.com.au']
    start_urls = ['https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1']

    def parse(self, response):
        item_price_extract = []
        item_title = []
        item_title_list = response.xpath('//h3[@class="lvtitle"]/a')
        item_href = response.xpath('//h3[@class="lvtitle"]/a/@href').getall()
        for title in item_title_list:
            item_title_text = title.xpath('string(.)').get()
            item_title.append(item_title_text)
        item_price = response.xpath('//li[@class="lvprice prc"]//span[@class="bold"]')
        for i in range(len(item_price)):
            item_price_text = item_price[i].xpath('string(.)').get()
            item_price_extract.append(item_price_text.strip())
        item_info = EbayItem(title=item_title, price=item_price_extract, item_href=item_href)
        yield item_info
        next_url_href = response.xpath('//a[@class="gspr next"]/@href').get()
        if next_url_href is not None:
            next_url_href = response.urljoin(next_url_href)
            yield scrapy.Request(next_url_href, callback=self.parse)
You will have to set ROBOTSTXT_OBEY = False in settings.py (which is not good practice); otherwise your spider won't scrape any data and will log this message:
[scrapy.downloadermiddlewares.robotstxt] DEBUG: Forbidden by robots.txt: <GET https://www.ebay.com.au/sch/auplazaplace/m.html?_nkw=&_armrs=1>
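As for passing data between callbacks (the follow-up in the question): one common pattern is to hand partial data to the next callback on the Request itself. A minimal sketch using cb_kwargs (available since Scrapy 1.7; the selector and field names here are illustrative):

def parse(self, response):
    # collect listing-page data, then pass it to the detail-page callback
    for product in response.xpath('//h3[@class="lvtitle"]/a'):
        listing_title = product.xpath('string(.)').get()
        link = response.urljoin(product.xpath('@href').get())
        yield scrapy.Request(
            link,
            callback=self.parse_detail,
            cb_kwargs={'listing_title': listing_title},
        )

def parse_detail(self, response, listing_title):
    # listing_title arrives here from the listing page
    yield {
        'listing_title': listing_title,
        'detail_title': response.xpath('//h1/text()').get(),
    }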

Scrapy returns all values in a single cell

I'm trying to scrape a site using Scrapy, but it returns all the values in a single cell; I expect each value in a different row.
example:
milage: 25
milage: 377
milage: 247433
milage: 464130
but I'm getting the data like this:
milage:[u'25',
u'377',
u'247433',
u'399109',
u'464130',
u'399631',
u'435238',
u'285000',
u'287470',
u'280000']
Here is my code:
import scrapy
from scrapy.selector import HtmlXPathSelector
from ..items import ExampleItem

url = 'https://example.com'

class Example(scrapy.Spider):
    name = 'example'
    allowed_domains = ['www.example.com']
    start_urls = [url]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item_selector = hxs.select('//div[@class="listing_format card5 relative"]')
        for fields in item_selector:
            item = ExampleItem()
            item['Mileage'] = fields.select('//li[strong="Mileage"]/span/text()').extract()
            yield item
You didn't show your site, but maybe you need a relative XPath:
item['Mileage'] = fields.select('.//li[strong="Mileage"]/span/text()').extract_first()
It sounds like you need to iterate over your mileages.
for fields in item_selector:
    milages = fields.select('//li[strong="Mileage"]/span/text()').extract()
    for milage in milages:
        item = ExampleItem()
        item['Mileage'] = milage
        yield item
Also consider making fields.select('//li[strong="Mileage"]/span/text()').extract() more specific.
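Combining the two suggestions above, a sketch of the per-listing pattern (using the current response.xpath/.get() selector API rather than the deprecated HtmlXPathSelector; the class and field names follow the question):

def parse(self, response):
    # one item per listing; the leading .// scopes the match to that listing
    for listing in response.xpath('//div[@class="listing_format card5 relative"]'):
        item = ExampleItem()
        item['Mileage'] = listing.xpath('.//li[strong="Mileage"]/span/text()').get()
        yield item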

Scrapy Spider not following Request callback using yield

I'm new to Scrapy and I can't get my spider to enter parse_votes in the code below, even though I set it as the callback. The other parse methods work fine, I don't get any ERROR, and I checked the link variable, which has the correct info. HELP?
EDIT - Full code
class DeputadosSpider(scrapy.Spider):
    name = "deputies"
    allowed_domains = ["camara.leg.br"]
    start_urls = ["http://www2.camara.leg.br/deputados/pesquisa"]

    def parse(self, response):
        sel = Selector(response)
        sel_options = sel.xpath('//*[@id="deputado"]/option[position()>1]')
        iteration = 1
        # get deputies pages
        for sel_option in sel_options:
            item = DeputiesInfo()
            item["war_name"] = sel_option.xpath("text()").extract()
            item["link_id"] = sel_option.extract().partition('?')[-1].rpartition('"')[0]
            item["page_link"] = 'http://www.camara.leg.br/internet/Deputado/dep_Detalhe.asp?id=' + item["link_id"]
            item["id"] = iteration
            iteration += 1
            # go scrape their page
            yield scrapy.Request(item["page_link"], callback=self.parse_deputy, meta={'item': item})

    def parse_deputy(self, response):
        item = response.meta['item']
        sel = Selector(response)
        info = sel.xpath('//div[@id="content"]/div/div[1]/ul/li')
        # finish filling in the data
        item["full_name"] = info.xpath("text()").extract_first()
        item["party"] = info.xpath("text()").extract()[2].partition('/')[0]
        item["uf"] = info.xpath("text()").extract()[2].partition('/')[-1].rpartition('/')[0]
        item["legislatures"] = info.xpath("text()").extract()[5]
        item["picture"] = sel.xpath('//div[@id="content"]/div/div[1]//img[1]/@src').extract()
        # save data to json file
        file = open('deputies_info.json', 'a')
        line = json.dumps(dict(item)) + ",\n"
        file.write(line)
        # collect votes info
        get_years = sel.xpath('//*[@id="my-informations"]/div[3]/div/ul/li[1]/a[position()<4]')
        for get_year in get_years:
            vote = VotesInfo()
            vote["deputy_id"] = item["id"]
            vote["year"] = get_year.xpath("text()").extract_first()
            link = get_year.xpath("@href").extract_first()
            print(vote["year"])
            print(link)
            # go to voting pages
            yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote})

    def parse_votes(self, response):
        # vote = response.meta['vote']
        print('YYYYYYYYYYYYYUHUL IM IN!!')
Your problem is allowed_domains: the link you request in parse_deputy is, for example, http://www.camara.gov.br/internet/deputado/RelVotacoes.asp?nuLegislatura=55&nuMatricula=410&dtInicio=01/01/2016&dtFim=30/12/2016, and its domain is camara.gov.br, so add it to allowed_domains:
allowed_domains = ["camara.leg.br", "camara.gov.br"]
PS: I ran your code commenting out allowed_domains, and parse_votes works perfectly.
I ran your spider and found why it never enters parse_votes.
I checked the link in yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote}) and found out that it is not in the same domain.
The link belongs to the camara.gov.br domain, which is not in allowed_domains = ["camara.leg.br"].
So you need to add this domain to the allowed_domains list:
allowed_domains = ["camara.leg.br", "camara.gov.br"]

How to scrape TripAdvisor dynamically using Scrapy and Python

I am trying to scrape TripAdvisor's reviews, but I cannot find the XPath to have the spider dynamically go through all the pages. I tried yield and callback, but the thing is I cannot find the XPath for the link that goes to the next page. I am talking about this site.
Here is my code (UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem

class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
    output_json_dict = {}

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        items = []
        i = 0
        for sites in sites:
            item = ScrapingTestingItem()
            # item['reviews'] = sel.xpath('//p[@class="partial_entry"]/text()').extract()
            item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
            item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
            item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
            items.append(item)
            i += 1
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        if sites and len(sites) > 0:
            yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
        else:
            yield items
If you want to select the URL behind Next why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
And then yield a Request with this URL? This way you always get the next page to scrape and don't need the line containing the numbers.
Recently I did something similar on TripAdvisor and this approach worked for me. If this doesn't work for you, update your code with the approach you are trying so we can see where it can be improved.
Update
And change your Request creation block to the following:
if sites and len(sites) > 0:
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield the items at the end of the loop, once the method has finished all its parsing.
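Putting both parts together, a sketch of the revised parse() (using response.urljoin to build absolute next-page URLs is an addition here; the original snippets concatenate strings by hand):

def parse(self, response):
    sel = Selector(response)
    # yield one item per review subject on the current page
    for subject in sel.xpath('//span[@class="noQuotes"]/text()').extract():
        item = ScrapingTestingItem()
        item['subjects'] = subject
        yield item
    # follow every "Next" link; urljoin turns relative hrefs into absolute URLs
    for site in sel.xpath('//a[contains(text(), "Next")]/@href').extract():
        yield Request(url=response.urljoin(site), callback=self.parse)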
I think it can only work if you make a list of the URLs you want to scrape in a .txt file.
class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

Scrapy spider not saving to csv

I have a spider which reads a list of URLs from a text file and saves the title and body text from each. The crawl works, but the data does not get saved to CSV. I set up a pipeline to save to CSV because the normal -o option did not work for me, and I changed settings.py for the pipeline. Any help with this would be greatly appreciated.
The code is as follows:
Items.py
from scrapy.item import Item, Field

class PrivacyItem(Item):
    # define the fields for your item here like:
    # name = Field()
    title = Field()
    desc = Field()
PrivacySpider.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from privacy.items import PrivacyItem

class PrivacySpider(CrawlSpider):
    name = "privacy"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for url in start_urls:
            item = PrivacyItem()
            item['desc'] = hxs.select('//body//p/text()').extract()
            item['title'] = hxs.select('//title/text()').extract()
            items.append(item)
        return items
Pipelines.py
import csv

class CSVWriterPipeline(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('CONTENT.csv', 'wb'))

    def process_item(self, item, spider):
        self.csvwriter.writerow([item['title'][0], item['desc'][0]])
        return item
You don't have to loop over start_urls; Scrapy does something like this for you:
for url in spider.start_urls:
    # request the url and call spider.parse() with its response
so your parse function should look something like:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    item = PrivacyItem()
    item['desc'] = hxs.select('//body//p/text()').extract()
    item['title'] = hxs.select('//title/text()').extract()
    return item
Also try to avoid returning lists as item fields; do something like hxs.select('..').extract()[0] to take just the first match.
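For instance, a sketch of that last suggestion applied to the parse() above (extract()[0] raises IndexError on an empty match, so a small guard helps; the first() helper is illustrative, not part of the original answer):

def parse(self, response):
    hxs = HtmlXPathSelector(response)

    def first(parts, default=''):
        # take the first match, or a default when nothing matched
        return parts[0] if parts else default

    item = PrivacyItem()
    item['title'] = first(hxs.select('//title/text()').extract())
    item['desc'] = first(hxs.select('//body//p/text()').extract())
    return item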
