I'm trying to scrape details from a subsite and merge them with the details scraped from the main site. I've been researching on Stack Overflow as well as in the documentation, but I still can't get my code to work. It seems that my function to extract additional details from the subsite does not work. If anyone could take a look I would be very grateful.
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        #Get infoID and Type from database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, category FROM dbo.StageItem")

        rows = self.cursor.fetchall()
        for row in rows:
            url = 'http://www.nevermind.com/info/'
            InfoID = row[0]
            category = row[1]
            yield self.make_requests_from_url(url+InfoID, InfoID, category, self.parse)

    def make_requests_from_url(self, url, InfoID, category, callback):
        request = Request(url, callback)
        request.meta['InfoID'] = InfoID
        request.meta['category'] = category
        return request

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path
        itemPool = []

        InfoID = response.meta['InfoID']
        category = response.meta['category']

        for info in infodata:
            item = infoItem()
            item_cur, item_hist = InfoItemSubSite()

            # Stem Details
            item['id'] = InfoID
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()

            item_cur['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item_cur['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()

            # Extract additional information about item_cur from refering site
            # This part does not work
            if item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract():
                url = 'http://www.nevermind.com/info/sub/' + item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()[0]
                request = Request(url, housingtype, self.parse_item_sub)
                request.meta['category'] = category
                yield self.parse_item_sub(url, category)

            item_his['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_his['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            item_his['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract()

            item['subsite_dic'] = [dict(item_cur), dict(item_his)]

            itemPool.append(item)
            yield item
        pass

    # Function to extract additional info from the subsite, and return it to the original item.
    def parse_item_sub(self, response, category):
        hxs = Selector(response)
        subsite = hxs.xpath('div/div[2]')  # input base path
        category = response.meta['category']

        for i in subsite:
            item = InfoItemSubSite()
            if (category == 'first'):
                item['subsite_field1'] = i.xpath('/td[2]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[2]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[5]/a[1]/@href').extract()
            else:
                item['subsite_field1'] = i.xpath('/tr[10]/td[3]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[1]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[7]/a[1]/@href').extract()
            return item
        pass
I've been looking at these examples together with a lot of other examples (Stack Overflow is great for that!), as well as the Scrapy documentation, but I am still unable to understand how to get details sent from one function and merged with the scraped items from the original function.
how do i merge results from target page to current page in scrapy?
How can i use multiple requests and pass items in between them in scrapy python
What you are looking for here is called request chaining. Your problem is yielding one item that is built from several requests. The solution is to chain the requests while carrying your item along in the request's meta attribute.
Example:
def parse(self, response):
    item = MyItem()
    item['name'] = response.xpath("//div[@id='name']/text()").extract()
    more_page = # some page that offers more details

    # go to more page and take your item with you.
    yield Request(more_page,
                  self.parse_more,
                  meta={'item': item})

def parse_more(self, response):
    # get your item from the meta
    item = response.meta['item']
    # fill it in with more data and yield!
    item['last_name'] = response.xpath("//div[@id='lastname']/text()").extract()
    yield item
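Applied to the spider in the question, the same pattern might look roughly like the sketch below. The item class, subsite URL prefix, and category values are taken from the question's code, but the XPath expressions and field assignments are only illustrative, and Request is Scrapy's own class (from scrapy import Request):

def parse(self, response):
    item = infoItem()
    item['id'] = response.meta['InfoID']
    item['field'] = response.xpath('div[2]/div[2]/tr[1]/td[2]/p/b/text()').extract()

    sub_href = response.xpath('div[2]/div[2]/tr[6]/td[2]/p/b/@href').extract()
    if sub_href:
        # follow the subsite and carry the partially filled item along in meta
        url = 'http://www.nevermind.com/info/sub/' + sub_href[0]
        yield Request(url,
                      callback=self.parse_item_sub,
                      meta={'item': item, 'category': response.meta['category']})
    else:
        # nothing to follow, so the item is already complete
        yield item

def parse_item_sub(self, response):
    # pick the item back up and add the subsite fields to it
    item = response.meta['item']
    if response.meta['category'] == 'first':
        item['subsite_field1'] = response.xpath('//td[2]/span/@title').extract()
    else:
        item['subsite_field1'] = response.xpath('//tr[10]/td[3]/span/@title').extract()
    # the single, merged item is yielded exactly once, here
    yield item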
Related
I am trying to figure out if my Scrapy spider is correctly hitting the product_link for the request callback - 'yield scrapy.Request(product_link, callback=self.parse_new_item)'.
product_link should be 'https://www.antaira.com/products/10-100Mbps/LNX-500A', but I have not been able to confirm whether my program is jumping into the next step so that I can retrieve the correct yield return. Thank you!
# Import the required libraries
import scrapy

# Import the Item class with fields
# mentioned in the items.py file
from ..items import AntairaItem


# Spider class name
class productJumper(scrapy.Spider):

    # Name of the spider
    name = 'productJumper'

    # The domain to be scraped
    allowed_domains = ['antaira.com']

    # The URLs to be scraped from the domain
    start_urls = ['https://www.antaira.com/products/10-100Mbps']
    #target_url = ['https://www.antaira.com/products/10-100Mbps/LNX-500A']

    # First Step: Find every div with the class 'product-container' and step into the links
    def parse(self, response):
        #product_link = response.urljoin(rel_product_link)
        # creating items dictionary
        items = AntairaItem()
        rel_product_link = response.css('div.center767')
        for url in rel_product_link:
            rel_product_link = response.xpath('//div[@class="product-container"]//a/@href').get(),
            product_link = response.urljoin('rel_product_link'),
            items['rel_product_link'] = rel_product_link,
            items['product_link'] = product_link

            #yield items
            # 2nd Step: Return a list of the all products-links that will be scrapped
            #yield {
            #    # take the first relative product link
            #    'rel_product_link' : rel_product_link,
            #    'product_link' : product_link,
            #}
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    # Final Step: Run through each product and Yield the results
    def parse_new_item(self, response):
        for product in response.css('main.products'):
            name = product.css(('h1.product-name::text').strip(' \t\n\r')).get()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = product.css('div.selectors img::attr(src)').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
You have a couple of issues:
Scrapy items are essentially dictionaries and are therefore mutable. You need to create a unique item for each and every yield statement.
Your second parse callback is referencing a variable items that it doesn't have access to, because it was defined in your first parse callback.
In your urljoin call you are using a string literal instead of a variable for rel_product_link.
In the example below I fixed those issues and made some additional notes.
import scrapy
from ..items import AntairaItem


class ProductJumper(scrapy.Spider):  # classes should be TitleCase

    name = 'productJumper'
    allowed_domains = ['antaira.com']
    start_urls = ['https://www.antaira.com/products/10-100Mbps']

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = product.css('div.selectors img::attr(src)').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name
            items['features'] = features
            items['overview'] = overview
            items['main_image'] = main_image
            items['rel_links'] = rel_links
            yield items
For an assignment I am trying to build a spider which is able to fetch data from the "www.kaercher.com" webshop. All the products in the webshop are loaded by an AJAX call; in order to load more products, a button named "show more products" has to be pressed. I managed to fetch the required data from the corresponding URL which is being called by the AJAX call.
However, for my assignment I am supposed to fetch all products/pages of a certain product. I've been digging around but I can't find a solution. I suppose I am supposed to do something with "isTruncated = true": true indicates that more products can be loaded, false means that there are no more products. (FIXED)
Once I manage to fetch the data from all the pages, I need to find a way to fetch the data for a list of products (create a .csv file with multiple Kaercher products; each product has a unique ID which can be seen in the URL, in this case the ID 20035386 is for the high pressure washer). (FIXED)
Links:
Webshop: https://www.kaercher.com/nl/webshop/hogedrukreinigers-resultaten.html
High pressure washer: https://www.kaercher.com/nl/webshop/hogedrukreinigers-resultaten.html
API Url (page1): https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL
OLD CODE
Spider file
import scrapy
from krc.items import KrcItem
import json


class KRCSpider(scrapy.Spider):
    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']

    def parse(self, response):
        item = KrcItem()
        data = json.loads(response.text)
        for company in data.get('products', []):
            item["productid"] = company["id"]
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"]
            yield item
Items file
import scrapy


class KrcItem(scrapy.Item):
    productid = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    price = scrapy.Field()
    pass
NEW CODE
EDIT: 15/08/2019
Thanks to @gangabass I managed to fetch data from all of the product pages. I also managed to fetch the data for the different products which are listed in a keywords.csv file. This enables me to fetch data for a list of products. See below for the new code:
Spider file (.py)
import scrapy
from krc.items import KrcItem
import json
import os
import csv


class KRCSpider(scrapy.Spider):
    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']

    def start_requests(self):
        """Read keywords from keywords file and construct the search URL"""
        with open(os.path.join(os.path.dirname(__file__), "../resources/keywords.csv")) as search_keywords:
            for keyword in csv.DictReader(search_keywords):
                search_text = keyword["keyword"]
                url = "https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(
                    search_text)
                # The meta is used to send our search text into the parser as metadata
                yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text})

    def parse(self, response):
        current_page = response.meta.get("page", 1)
        next_page = current_page + 1

        item = KrcItem()
        data = json.loads(response.text)
        for company in data.get('products', []):
            item["productid"] = company["id"]
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"].replace("\u20ac","").strip()
            yield item

        if data["isTruncated"]:
            yield scrapy.Request(
                url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
                callback=self.parse,
                meta={'page': next_page},
            )
Items file (.py)
import scrapy


class KrcItem(scrapy.Item):
    productid = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    price = scrapy.Field()
    producttype = scrapy.Field()
    pass
keywords file (.csv)
keyword,keywordtype
20035386,Hogedrukreiniger
20072956,Floor Cleaner
You can use response.meta to send the current page number between requests:
def parse(self, response):
    current_page = response.meta.get("page", 1)
    next_page = current_page + 1

    item = KrcItem()
    data = json.loads(response.text)
    for company in data.get('products', []):
        item["productid"] = company["id"]
        item["name"] = company["name"]
        item["description"] = company["description"]
        item["price"] = company["priceFormatted"]
        yield item

    if data["isTruncated"]:
        yield scrapy.Request(
            url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
            callback=self.parse,
            meta={'page': next_page},
        )
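If this is combined with the keyword approach from the edited spider above, the product ID does not need to be hard-coded in the next-page URL. A minimal sketch, assuming the same API URL format and that the search text is kept in meta under the search_text key used above:

def parse(self, response):
    search_text = response.meta["search_text"]
    current_page = response.meta.get("page", 1)
    next_page = current_page + 1

    data = json.loads(response.text)
    for company in data.get('products', []):
        item = KrcItem()
        item["productid"] = company["id"]
        item["name"] = company["name"]
        item["description"] = company["description"]
        item["price"] = company["priceFormatted"]
        yield item

    if data["isTruncated"]:
        # build the next-page URL for the same keyword and keep both values in meta
        yield scrapy.Request(
            url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page={1}&size=8&isocode=nl-NL".format(
                search_text, next_page),
            callback=self.parse,
            meta={"search_text": search_text, "page": next_page},
        )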
I'm new to Scrapy and I can't get my spider to enter parse_votes in the code below, even though I set it as the callback. The other parse methods are working fine, I don't get any ERROR, and I checked the 'link' variable, which has the correct info. Help?
EDIT - Full code
class DeputadosSpider(scrapy.Spider):
    name = "deputies"
    allowed_domains = ["camara.leg.br"]
    start_urls = ["http://www2.camara.leg.br/deputados/pesquisa"]

    def parse(self, response):
        sel = Selector(response)
        sel_options = sel.xpath('//*[@id="deputado"]/option[position()>1]')
        iteration = 1
        # get deputies pages
        for sel_option in sel_options:
            item = DeputiesInfo()
            item["war_name"] = sel_option.xpath("text()").extract()
            item["link_id"] = sel_option.extract().partition('?')[-1].rpartition('"')[0]
            item["page_link"] = 'http://www.camara.leg.br/internet/Deputado/dep_Detalhe.asp?id=' + item["link_id"]
            item["id"] = iteration
            iteration += 1
            # go scrap their page
            yield scrapy.Request(item["page_link"], callback=self.parse_deputy, meta={'item': item})

    def parse_deputy(self, response):
        item = response.meta['item']
        sel = Selector(response)
        info = sel.xpath('//div[@id="content"]/div/div[1]/ul/li')
        # end to fill the data
        item["full_name"] = info.xpath("text()").extract_first()
        item["party"] = info.xpath("text()").extract()[2].partition('/')[0]
        item["uf"] = info.xpath("text()").extract()[2].partition('/')[-1].rpartition('/')[0]
        item["legislatures"] = info.xpath("text()").extract()[5]
        item["picture"] = sel.xpath('//div[@id="content"]/div/div[1]//img[1]/@src').extract()
        # save data to json file
        file = open('deputies_info.json', 'a')
        line = json.dumps(dict(item)) + ",\n"
        file.write(line)
        # colect votes info
        get_years = sel.xpath('//*[@id="my-informations"]/div[3]/div/ul/li[1]/a[position()<4]')
        for get_year in get_years:
            vote = VotesInfo()
            vote["deputy_id"] = item["id"]
            vote["year"] = get_year.xpath("text()").extract_first()
            link = get_year.xpath("@href").extract_first()
            print(vote["year"])
            print(link)
            # go to voting pages
            yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote})

    def parse_votes(self, response):
        #vote = response.meta['vote']
        print('YYYYYYYYYYYYYUHUL IM IN!!')
Your problem is allowed_domains, because the link you are trying to request in parse_deputy is, for example: http://www.camara.gov.br/internet/deputado/RelVotacoes.asp?nuLegislatura=55&nuMatricula=410&dtInicio=01/01/2016&dtFim=30/12/2016
Its domain is camara.gov.br, so add it to allowed_domains:
allowed_domains = ["camara.leg.br", "camara.gov.br"]
PS: I ran your code commenting out allowed_domains, and parse_votes works perfectly.
I ran your spider and found why it never enters parse_votes.
I checked the link in yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote}) and found out that it is not in the same domain.
The link belongs to the camara.gov.br domain, which is not in allowed_domains = ["camara.leg.br"].
So you need to add this domain to the allowed_domains list:
allowed_domains = ["camara.leg.br", "camara.gov.br"]
I am trying to make a Scrapy bot that utilizes pagination but am having no success...
The bot crawls through all of the links on the first page but never goes on to the next page. I have read a ton of different threads and I can't figure this out at all. I am very new to web scraping, so please feel free to hammer the crap out of my code.
import time
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http.request import Request
from tutorial.items import TutorialItem
#from scrapy_tutorial.items import ScrapyTutorialItem


class raytheonJobsPageSpider(CrawlSpider):

    name = "raytheonJobsStart"
    allowed_domains = ["jobs.raytheon.com"]
    start_urls = [
        "https://jobs.raytheon.com/search-jobs"
    ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//div[@class="next"]',)), callback='parse_listings', follow=True),
    )

    def parse_start_url(self, response):
        '''
        Crawl start URLs
        '''
        return self.parse_listings(response)

    def parse_listings(self, response):
        '''
        Extract data from listing pages
        '''
        sel = Selector(response)
        jobs = response.xpath(
            '//*[@id="search-results-list"]/ul/*/a/@href'
        ).extract()
        nextLink = response.xpath('//a[@class="next"]').extract()
        print "This is just the next page link - ", nextLink
        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)

            yield Request(job_url, callback=self.parse_details)

    def parse_details(self, response):
        '''
        Extract data from details pages
        '''
        sel = Selector(response)
        job = sel.xpath('//*[@id="content"]')
        item = TutorialItem()
        # Populate job fields
        item['title'] = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
        jobTitle = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
        item['reqid'] = job.xpath('//*[@id="content"]/section[1]/div/span[1]/text()').extract()
        item['location'] = job.xpath('//*[@id="content"]/section[1]/div/span[last()]/text()').extract()
        item['applink'] = job.xpath('//*[@id="content"]/section[1]/div/a[2]/@href').extract()
        item['description'] = job.xpath('//*[@id="content"]/section[1]/div/div').extract()
        item['clearance'] = job.xpath('//*[@id="content"]/section[1]/div/*/text()').extract()
        #item['page_url'] = response.url
        item = self.__normalise_item(item, response.url)
        time.sleep(1)
        return item

    def __normalise_item(self, item, base_url):
        '''
        Standardise and format item fields
        '''
        # Loop item fields to sanitise data and standardise data types
        for key, value in vars(item).values()[0].iteritems():
            item[key] = self.__normalise(item[key])

        # Convert job URL from relative to absolute URL
        #item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])

        return item

    def __normalise(self, value):
        print self, value
        # Convert list to string
        value = value if type(value) is not list else ' '.join(value)
        # Trim leading and trailing special characters (Whitespaces, newlines, spaces, tabs, carriage returns)
        value = value.strip()
        return value

    def __to_absolute_url(self, base_url, link):
        '''
        Convert relative URL to absolute URL
        '''
        import urlparse
        link = urlparse.urljoin(base_url, link)
        return link

    def __to_int(self, value):
        '''
        Convert value to integer type
        '''
        try:
            value = int(value)
        except ValueError:
            value = 0
        return value

    def __to_float(self, value):
        '''
        Convert value to float type
        '''
        try:
            value = float(value)
        except ValueError:
            value = 0.0
        return value
You don't need PhantomJS or Splash.
By inspecting the AJAX calls I found that the jobs are loaded via AJAX calls to this URL.
You can see the CurrentPage parameter at the end of the URL.
The result is returned in JSON format, and all jobs are under the key named results.
I created a project on my side and wrote fully working code for you. Here is the link to it on GitHub; just download and run it ... you don't have to do anything at all :P
Download the whole working project from here: https://github.com/mani619cash/raytheon_pagination
The basic logic is here:
class RaytheonspiderSpider(CrawlSpider):
    name = "raytheonJobsStart"
    page = 180
    ajaxURL = "https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=5&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&CurrentPage="

    def start_requests(self):
        yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)

    def parse_listings(self, response):
        resp = json.loads(response.body)
        response = Selector(text=resp['results'])
        jobs = response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract()
        if jobs:
            for job_url in jobs:
                job_url = "https://jobs.raytheon.com" + self.__normalise(job_url)
                #job_url = self.__to_absolute_url(response.url, job_url)
                yield Request(url=job_url, callback=self.parse_details)
        else:
            raise CloseSpider("No more pages... exiting...")

        # go to next page...
        self.page = self.page + 1
        yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)
Change
restrict_xpaths=('//div[@class="next"]',)) to
restrict_xpaths=('//a[@class="next"]',))
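Inside the rules tuple of the spider above, that change would read roughly as follows (a sketch; everything except the swapped XPath is copied from the question):

rules = (
    Rule(LinkExtractor(restrict_xpaths=('//a[@class="next"]',)),
         callback='parse_listings',
         follow=True),
)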
If this is not working, then do a recursive call to the parse_listings function:
def parse_listings(self, response):
    '''
    Extract data from listing pages
    '''
    sel = Selector(response)
    jobs = response.xpath(
        '//*[@id="search-results-list"]/ul/*/a/@href'
    ).extract()
    nextLink = response.xpath('//a[@class="next"]').extract()
    print "This is just the next page link - ", nextLink
    for job_url in jobs:
        job_url = self.__normalise(job_url)
        job_url = self.__to_absolute_url(response.url, job_url)

        yield Request(job_url, callback=self.parse_details)

    yield Request(pagination link here, callback=self.parse_listings)
I am on mobile so I can't type out the code. I hope the logic I described makes sense.
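Filling in that placeholder, the recursive version might look roughly like the sketch below. The assumption here is that the pagination link can be read from the //a[@class="next"] element the spider already selects, and that it may be relative, so it is joined against response.url with the spider's own __to_absolute_url helper:

def parse_listings(self, response):
    '''
    Extract data from listing pages, then follow the next-page link recursively
    '''
    jobs = response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract()
    for job_url in jobs:
        job_url = self.__normalise(job_url)
        job_url = self.__to_absolute_url(response.url, job_url)
        yield Request(job_url, callback=self.parse_details)

    # follow the pagination link, if the page has one
    next_link = response.xpath('//a[@class="next"]/@href').extract()
    if next_link:
        next_url = self.__to_absolute_url(response.url, next_link[0])
        yield Request(next_url, callback=self.parse_listings)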
I'm scraping a website which has different rows based on the type of item that I'm scraping. I have a working scraper that looks like the 1st blockcode below; however, I would like to be able to take a type from the database and send it from start_requests(self) to the parse function. I have 11 different types, which all have a different number of rows for one table on some part of the page, whereas the rest of the rows in the other tables on the page are the same. I have tried showing the code in the 2nd blockcode.
How do I accomplish taking the type from the database in start_requests and sending it to parse?
1st blockcode
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        #Get infoID and Type from database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID FROM dbo.infostage")

        rows = self.cursor.fetchall()
        for row in rows:
            url = 'http://www.nevermind.com/info/'
            yield self.make_requests_from_url(url+row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path
        itemPool = []

        InfoID = ''.join(response.url)
        id = InfoID[29:len(InfoID)-1]

        for info in infodata:
            item = infoItem()

            # Details
            item['id'] = id  #response.url

            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
            item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()

            itemPool.append(item)
            yield item
        pass
2nd blockcode
This does not work, but I'm not sure how to get it working. Do I create a global list, a new function?
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc


class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        #Get infoID and Type from database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, type FROM dbo.infostage")

        rows = self.cursor.fetchall()
        for row in rows:
            url = 'http://www.nevermind.com/info/'
            type = row[1]  # how do I send this value to the parse function?
            yield self.make_requests_from_url(url+row[0])

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input base path
        itemPool = []

        InfoID = ''.join(response.url)
        id = InfoID[29:len(InfoID)-1]

        for info in infodata:
            item = infoItem()

            # Details
            item['id'] = id  #response.url

            # Here I need to implement a condition that comes from def start_requests(self).
            # If condition meet then scrape the following fields else the next
            if type = 'type1':
                # This is where I would like to use it.
                # I have 11 different types, that all have different number of rows for one table on some part of the page, whereas the rest of the rows in the other tables on the page are the same.
                # Type 1
                item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            else:
                item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
                item['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
                item['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()

            itemPool.append(item)
            yield item
        pass
Thank you all for your help and insight!
You can use request.meta
def make_requests_from_url(self, url, type, callback):
    request = scrapy.Request(url, callback)
    request.meta['type'] = type
    return request
In parse you can access type using response.meta['type']
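Put together, the flow might look roughly like the sketch below. It reuses the question's make_requests_from_url helper, item class, query, and URL, and only illustrates the branching, so the field lists are trimmed:

def start_requests(self):
    self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
    self.cursor = self.conn.cursor()
    self.cursor.execute("SELECT InfoID, type FROM dbo.infostage")
    for row in self.cursor.fetchall():
        url = 'http://www.nevermind.com/info/' + row[0]
        # send the type along with the request instead of keeping it in a local variable
        yield self.make_requests_from_url(url, row[1], self.parse)

def parse(self, response):
    # read the type back out of the request metadata
    type = response.meta['type']
    for info in response.xpath('div[2]/div[2]'):
        item = infoItem()
        if type == 'type1':
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            # ... the remaining type 1 fields go here
        else:
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            # ... the remaining shared fields go here
        yield item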