I'm new to Scrapy and I can't get my spider to enter parse_votes in the code below, even though I set it as the callback. The other parse methods work fine, I don't get any error, and I checked the 'link' variable, which holds the correct URL. Can anyone help?
EDIT - Full code
class DeputadosSpider(scrapy.Spider):
    """Collect basic data for every deputy, then follow their voting pages.

    Flow: ``parse`` reads the deputy ``<select>`` on the search page and
    requests each deputy's detail page; ``parse_deputy`` completes the item,
    appends it to ``deputies_info.json`` and requests the yearly voting
    pages; ``parse_votes`` is the (still stubbed) callback for those pages.
    """

    name = "deputies"
    # NOTE: this is the spider as asked about. Only camara.leg.br is allowed
    # here, while the voting links followed from parse_deputy point at
    # camara.gov.br (see the answers below), so those requests are filtered
    # as offsite and parse_votes is never reached.
    allowed_domains = ["camara.leg.br"]
    start_urls = ["http://www2.camara.leg.br/deputados/pesquisa"]

    def parse(self, response):
        """Yield one detail-page request per ``<option>`` of the deputy list."""
        sel = Selector(response)
        # position()>1 skips the first <option> of the select box
        sel_options = sel.xpath('//*[@id="deputado"]/option[position()>1]')
        # get deputies pages; ids are assigned sequentially starting at 1
        for iteration, sel_option in enumerate(sel_options, start=1):
            item = DeputiesInfo()
            item["war_name"] = sel_option.xpath("text()").extract()
            # the id is whatever sits between the first '?' and the last '"'
            # of the serialized <option> element
            item["link_id"] = sel_option.extract().partition('?')[-1].rpartition('"')[0]
            item["page_link"] = (
                'http://www.camara.leg.br/internet/Deputado/dep_Detalhe.asp?id='
                + item["link_id"]
            )
            item["id"] = iteration
            # go scrape their page, carrying the partial item along in meta
            yield scrapy.Request(item["page_link"], callback=self.parse_deputy, meta={'item': item})

    def parse_deputy(self, response):
        """Complete the item from the detail page, save it, request vote pages."""
        item = response.meta['item']
        sel = Selector(response)
        info = sel.xpath('//div[@id="content"]/div/div[1]/ul/li')
        # extract the text nodes once instead of re-running the query per field
        texts = info.xpath("text()").extract()
        # finish filling in the data
        item["full_name"] = info.xpath("text()").extract_first()
        item["party"] = texts[2].partition('/')[0]
        item["uf"] = texts[2].partition('/')[-1].rpartition('/')[0]
        item["legislatures"] = texts[5]
        item["picture"] = sel.xpath('//div[@id="content"]/div/div[1]//img[1]/@src').extract()
        # save data to json file; the context manager closes the handle,
        # which the original left open on every call
        with open('deputies_info.json', 'a') as out:
            out.write(json.dumps(dict(item)) + ",\n")
        # collect votes info: position()<4 keeps at most the first three links
        get_years = sel.xpath('//*[@id="my-informations"]/div[3]/div/ul/li[1]/a[position()<4]')
        for get_year in get_years:
            vote = VotesInfo()
            vote["deputy_id"] = item["id"]
            vote["year"] = get_year.xpath("text()").extract_first()
            link = get_year.xpath("@href").extract_first()
            print(vote["year"])
            print(link)
            # go to voting pages
            yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote})

    def parse_votes(self, response):
        """Stub callback for a voting page; only reports that it was reached."""
        #vote = response.meta['vote']
        print('YYYYYYYYYYYYYUHUL IM IN!!')
Your problem is allowed_domains, because the link you are trying to request in parse_deputy is for example: http://www.camara.gov.br/internet/deputado/RelVotacoes.asp?nuLegislatura=55&nuMatricula=410&dtInicio=01/01/2016&dtFim=30/12/2016
and its domain is camara.gov.br so add it to allowed_domains.
allowed_domains = ["camara.leg.br", "camara.gov.br"]
PS: I ran your code with allowed_domains commented out, and parse_votes works perfectly.
I ran your spider and found why it never enters parse_votes.
I checked the link in yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote}) and found out that it is not in the same domain
The link belongs to the camara.gov.br domain, which does not belong to the allowed_domains = ["camara.leg.br"]
So you need to add this domain to the allowed_domains list.
allowed_domains = ["camara.leg.br", "camara.gov.br"]
Related
I am trying to build a spider that gathers information about startups. I wrote a Python script with Scrapy that should access the website and store the information in a dictionary. I think the code should work from a logical point of view, but somehow I do not get any output. My code:
import scrapy
class StartupsSpider(scrapy.Spider):
    """Startup spider exactly as posted in the question.

    Only the extraction damage is repaired here (indentation and the ``@``
    of the XPath attribute tests). The spider's own problems, listed in the
    answer that follows, are intentionally left in place so that the answer
    still applies.
    """

    # name of the spider
    name = 'startups'
    # list of allowed domains
    allowed_domains = ['www.bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']
    # starting url
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']

    def parse(self, response):
        """Request every startup linked from a list page, then the next page."""
        # parse initial start URL for the specific startup URL
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        for startup in startups:
            absolute_url = response.urljoin(startup)
            # parse the actual startup information
            yield scrapy.Request(absolute_url, callback=self.parse_startup)
        # link to next page
        next_page_url = response.xpath('//*[@class ="pagination-link"]/@href').get()
        absolute_next_page_url = response.urljoin(next_page_url)
        # go through all pages on start URL
        yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        """Yield one dict with the details scraped from a startup page."""
        # get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@class="document-info-item"]/a/@href').get()
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@class="date"]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@class="date"]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class ="person-contact"]/p/a/span/text()').get()
        contact_address_street = response.xpath('//*[@class ="adr"]/text()').get()
        contact_address_plz = response.xpath('//*[@class ="locality"]/text()').getall()
        contact_state = response.xpath('//*[@class ="country-name"]/text()').get()
        yield {
            'Startup': startup_name,
            'Homepage': startup_hompage,
            'Description': startup_description,
            'Branche': branche,
            'Gründungsdatum': founded,
            'Anzahl Mitarbeiter': employees,
            'Kapital Bedarf': capital,
            'Datum des Förderbescheids': applied_for_invest,
            'Contact': contact_name,
            'Telefon': contact_phone,
            'E-Mail': contact_mail,
            'Adresse': contact_address_street + contact_address_plz + contact_state,
        }
1. You're not getting output because your allowed_domains is wrong: it must contain a domain name, not a full URL path.
2. In the last line (Adresse), you're trying to concatenate list and str types, so you'll get an error.
3. Your pagination link is wrong: on the first page you're getting the next page, but on the second page you're getting the previous page.
4. You're not doing any error checking. On some pages the selectors return fewer results (or None) than you expect, and indexing into them results in an error.
I fixed 1, 2, and 3, but you'll need to fix number 4 yourself.
import scrapy
class StartupsSpider(scrapy.Spider):
    """Crawl the INVEST database list pages and yield one dict per startup.

    ``parse`` handles the paginated list, ``parse_startup`` a single
    startup page. The positional lookups in ``parse_startup`` are still
    unguarded, as the accompanying answer points out.
    """

    # name of the spider
    name = 'startups'
    # list of allowed domains (bare domain, no scheme or path)
    allowed_domains = ['bmwk.de']
    # starting url
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']

    def parse(self, response):
        """Request every startup linked from a list page, then the next page."""
        # parse initial start URL for the specific startup URL
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        for startup in startups:
            absolute_url = response.urljoin(startup)
            # parse the actual startup information
            yield scrapy.Request(absolute_url, callback=self.parse_startup)

        # link to next page: take the LAST pagination link, because from the
        # second page on the first one points back to the previous page
        next_page_url = response.xpath('(//*[@class ="pagination-link"])[last()]/@href').get()
        if next_page_url:
            # go through all pages on start URL
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        """Yield one dict with the details scraped from a startup page."""
        # get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@class="document-info-item"]/a/@href').get()
        # for example for some of the pages you'll get an error here:
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@class="date"]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@class="date"]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class ="person-contact"]/p/a/span/text()').get()
        # join every text node of the address block into a single string
        address = ' '.join(response.xpath('//*[@class ="address"]//text()').getall())
        yield {
            'Startup': startup_name,
            'Homepage': startup_hompage,
            'Description': startup_description,
            'Branche': branche,
            'Gründungsdatum': founded,
            'Anzahl Mitarbeiter': employees,
            'Kapital Bedarf': capital,
            'Datum des Förderbescheids': applied_for_invest,
            'Contact': contact_name,
            'Telefon': contact_phone,
            'E-Mail': contact_mail,
            'Adresse': address,
        }
You need to run the spider from the command prompt and give it an output file (a .json or .csv file name):
scrapy crawl startups -o filename.json
As the title states, I am trying to run my scrapy program, the issue I am running into is that it seems to be only returning the yield from the initial url (https://www.antaira.com/products/10-100Mbps).
I am unsure on where my program is not working, in my code I have also left some commented code on what I have attempted.
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    """Product spider exactly as posted in the question.

    Only the extraction damage is repaired here (indentation and the ``@``
    of the XPath attribute tests); the logic is left as it was asked about.
    """

    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    # NOTE: only the first entry is followed by a comma. Python joins the
    # remaining five adjacent string literals into ONE long string, so this
    # list holds two elements rather than six.
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        """Request every product linked from a category page."""
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Yield one AntairaItem per ``main.products`` element of the page."""
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            # the trailing commas make each stored value a 1-tuple
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason, when I run "scrapy crawl productJumperFix" I'm not getting any output in the terminal, and I'm not sure how to debug it since I can't even see any errors.
Try using the start_requests method:
For example:
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider):
    """Crawl the Antaira category pages and yield one item per product."""

    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        """Yield one request per category page."""
        # every entry ends with a comma, so these stay six separate URLs
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Request every product linked from a category page."""
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Yield one AntairaItem per ``main.products`` element of the page."""
        for product in response.css('main.products'):
            items = AntairaItem()
            items['product_link'] = response.url

            # .get() returns None when nothing matches; only strip real text
            name = product.css('h1.product-name::text').get()
            items['name'] = name.strip() if name else name

            items['features'] = product.css('section.features h3 + ul').getall()
            items['overview'] = product.css('.products .product-overview::text').getall()

            # join only when an image was found; urljoin(None) would just
            # hand back the page URL and masquerade as an image link
            image_src = product.css('div.selectors img::attr(src)').get()
            items['main_image'] = response.urljoin(image_src) if image_src else None

            # the leading '//' makes this query search the whole document,
            # not just the current product element
            items['rel_links'] = product.xpath(
                "//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]"
            ).getall()

            # No trailing commas on the assignments above: `x = value,`
            # would store a 1-tuple instead of the value itself.
            yield items
I'm trying to scrape details from a subsite and merge them with the details scraped from the main site. I've been researching through Stack Overflow as well as the documentation, but I still can't get my code to work. It seems that my function to extract additional details from the subsite does not work. If anyone could take a look I would be very grateful.
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc
# Spider from the question: reads (InfoID, category) rows from a SQL Server
# table, requests one info page per row, and tries to merge details from a
# linked sub page into the item. Indentation was lost in extraction and the
# block is not valid Python as it stands (see the notes in parse()).
# NOTE(review): '#href' / '#title' in the XPath strings below look like
# '@href' / '@title' garbled in extraction -- confirm against the original.
class scrapeInfo(Spider):
name = "info"
# NOTE(review): this entry is a full URL including the scheme; the other
# snippets in this file list bare domain names here -- confirm.
allowed_domains = ["http://www.nevermind.com"]
start_urls = []
# Build the initial requests from the database instead of start_urls.
def start_requests(self):
# Get InfoID and category from the database
self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
self.cursor = self.conn.cursor()
self.cursor.execute("SELECT InfoID, category FROM dbo.StageItem")
rows = self.cursor.fetchall()
for row in rows:
url = 'http://www.nevermind.com/info/'
InfoID = row[0]
category = row[1]
# url+InfoID is a string concatenation, so InfoID must be a str here
yield self.make_requests_from_url(url+InfoID, InfoID, category, self.parse)
# Create a Request for `url` and attach InfoID and category to its meta so
# the callback can read them back.
# NOTE(review): `Request` is not among the imports shown above this class.
def make_requests_from_url(self, url, InfoID, category, callback):
request = Request(url, callback)
request.meta['InfoID'] = InfoID
request.meta['category'] = category
return request
# Callback for an info page: builds one item per matched node.
def parse(self, response):
hxs = Selector(response)
infodata = hxs.xpath('div[2]/div[2]') # input item path
itemPool = []
InfoID = response.meta['InfoID']
category = response.meta['category']
for info in infodata:
item = infoItem()
# Unpacks the result of a single InfoItemSubSite() call into two names.
# NOTE(review): the code below writes to `item_his`, not `item_hist`.
item_cur, item_hist = InfoItemSubSite()
# Stem Details
item['id'] = InfoID
item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
item_cur['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
item_cur['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/#href').extract()
# Extract additional information about item_cur from the referring site
# This part does not work
# NOTE(review): the next two lines are not valid Python -- an `=` assignment
# inside an `if` condition, and an assignment following a `+` expression.
if item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/#href').extract():
url = 'http://www.nevermind.com/info/sub/' + item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/#href').extract()[0]
# NOTE(review): `housingtype` is not defined anywhere in the visible code.
request = Request(url, housingtype, self.parse_item_sub)
request.meta['category'] = category
# Calls parse_item_sub directly with (url, category); the `request` built
# above is never yielded, so the sub page is never scheduled for download.
yield self.parse_item_sub(url, category)
item_his['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
item_his['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
item_his['field7'] = info.xpath('tr[7]/td[2]/p/b/#href').extract()
item['subsite_dic'] = [dict(item_cur), dict(item_his)]
# itemPool is appended to but never read in the visible code
itemPool.append(item)
yield item
pass
# Function to extract additional info from the subsite, and return it to the original item.
# NOTE(review): the `category` parameter is immediately overwritten from
# response.meta, and parse() passes a URL string where `response` is expected.
def parse_item_sub(self, response, category):
hxs = Selector(response)
subsite = hxs.xpath('div/div[2]') # input base path
category = response.meta['category']
for i in subsite:
item = InfoItemSubSite()
# The two branches read the same three fields from different positions.
if (category == 'first'):
item['subsite_field1'] = i.xpath('/td[2]/span/#title').extract()
item['subsite_field2'] = i.xpath('/tr[4]/td[2]/text()').extract()
item['subsite_field3'] = i.xpath('/div[5]/a[1]/#href').extract()
else:
item['subsite_field1'] = i.xpath('/tr[10]/td[3]/span/#title').extract()
item['subsite_field2'] = i.xpath('/tr[4]/td[1]/text()').extract()
item['subsite_field3'] = i.xpath('/div[7]/a[1]/#href').extract()
# NOTE(review): with indentation lost it is unclear whether this return sits
# inside the loop (first node only) or after it -- confirm.
return item
pass
I've been looking at these examples together with a lot of other examples (Stack Overflow is great for that!), as well as the Scrapy documentation, but I am still unable to understand how to send details from one function and merge them with the items scraped in the original function.
how do i merge results from target page to current page in scrapy?
How can i use multiple requests and pass items in between them in scrapy python
What you are looking here is called request chaining. Your problem is - yield one item from several requests. A solution to this is to chain requests while carrying your item in requests meta attribute.
Example:
def parse(self, response):
    """Start the item on the first page and pass it on with the next request."""
    item = MyItem()
    item['name'] = response.xpath("//div[@id='name']/text()").extract()
    more_page = ...  # placeholder: URL of some page that offers more details
    # go to more page and take your item with you.
    yield Request(more_page, callback=self.parse_more, meta={'item': item})
def parse_more(self, response):
    """Complete the item carried in the request meta and yield it."""
    # get your item from the meta
    item = response.meta['item']
    # fill it in with more data and yield!
    item['last_name'] = response.xpath("//div[@id='lastname']/text()").extract()
    yield item
I have a ScraPy Code that is running in shell, but when I try to export it to csv, it returns an empty file. It exports data when I do not go into a link and try to parse the description, but once I add the extra method of parsing the contents, it fails to work. Here is the code:
class MonsterSpider(CrawlSpider):
    """Job spider exactly as posted in the question.

    Only the extraction damage is repaired here (indentation and the ``@``
    of the XPath attribute tests). The two problems the answers diagnose are
    marked with NOTE comments and intentionally left in place.
    """

    name = "monster"
    # NOTE: the job links followed from parse_items live on other monster.com
    # subdomains (see the answers below), so they are filtered as offsite.
    allowed_domains = ["jobs.monster.com"]
    base_url = "http://jobs.monster.com/v-technology.aspx?"
    start_urls = [
        "http://jobs.monster.com/v-technology.aspx"
    ]
    # runs once at class-definition time and adds pages 1 to 4
    for i in range(1,5):
        start_urls.append(base_url + "page=" + str(i))

    rules = (
        Rule(SgmlLinkExtractor(allow=("jobs.monster.com",)), callback='parse_items'),
    )

    def parse_items(self, response):
        """Build one item per result row and request its detail page."""
        sel = Selector(response)
        sites = sel.xpath('//div[@class="col-xs-12"]')
        #items = []
        for site in sites.xpath('.//article[@class="js_result_row"]'):
            item = MonsterItem()
            item['title'] = site.xpath('.//span[@itemprop = "title"]/text()').extract()
            item['company'] = site.xpath('.//span[@itemprop = "name"]/text()').extract()
            item['city'] = site.xpath('.//span[@itemprop = "addressLocality"]/text()').extract()
            item['state'] = site.xpath('.//span[@itemprop = "addressRegion"]/text()').extract()
            item['link'] = site.xpath('.//a[@data-m_impr_a_placement_id= "jsr"]/@href').extract()
            # extract() returns a list; join it into a single URL string
            follow = ''.join(item["link"])
            request = Request(follow, callback = self.parse_dir_contents)
            # carry the partial item over to the detail-page callback
            request.meta["item"] = item
            yield request
            #items.append(item)
        #return items

    def parse_dir_contents(self, response):
        """Add the job description to the item carried in the request meta."""
        item = response.meta["item"]
        # NOTE: `site` is not defined in this method (see the answers below).
        item['desc'] = site.xpath('.//div[@itemprop = "description"]/text()').extract()
        return item
Taking out the parse_dir_contents and uncommenting the empty "lists" list and "append" code was the original code.
Well, as @tayfun suggests, you should use response.xpath or define the site variable.
By the way, you do not need to use sel = Selector(response). Responses come with the xpath function, so there is no need to wrap the response in another selector.
However the main problem is that you restrict the domain of the spider. You define allowed_domains = ["jobs.monster.com"] however if you look at the URL to follow of your custom Request you can see that they are something like http://jobview.monster.com/ or http://job-openings.monster.com. In this case your parse_dir_contents is not executed (the domain is not allowed) and your item does not get returned so you won't get any results.
Change allowed_domains = ["jobs.monster.com"] to
allowed_domains = ["monster.com"]
and you will be fine and your app will work and return items.
You have an error in your parse_dir_contents method:
def parse_dir_contents(self, response):
    """Add the job description to the item carried in the request meta."""
    item = response.meta["item"]
    # Query the response itself: the per-row `site` selector used in
    # parse_items does not exist in this callback.
    item['desc'] = response.xpath('.//div[@itemprop="description"]/text()').extract()
    return item
Note the use of response. I don't know where you got site that you are currently using from.
Also, try to provide the error details when you post a question. Writing "it fails to work" doesn't say much.
I am trying to scrape TripAdvisor's reviews, but I cannot find the XPath that lets the spider go through all the pages dynamically. I tried yield and callback, but I cannot find the XPath for the link that goes to the next page. I am talking about this site.
Here Is my code(UPDATED):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem
# Spider from the question: scrapes one TripAdvisor hotel-review page and
# tries to follow its "Next" link. Indentation was lost in extraction, so the
# nesting of the statements after the item assignments is uncertain.
# NOTE(review): '#href' / '#class' / '#alt' in the XPath strings below look
# like '@href' / '@class' / '@alt' garbled in extraction -- confirm.
class scrapingtestspider(Spider):
name = "scrapytesting"
allowed_domains = ["tripadvisor.in"]
# base_uri and output_json_dict are not used in the visible code
base_uri = "tripadvisor.in"
start_urls = [
"http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
output_json_dict = {}
# Callback for a review page: collects review fields and requests more pages.
def parse(self, response):
sel = Selector(response)
# href values of every link whose text contains "Next"
sites = sel.xpath('//a[contains(text(), "Next")]/#href').extract()
items = []
i=0
# The loop variable reuses the name of the list being iterated, so `sites`
# holds a single href string while the loop body runs.
for sites in sites:
item = ScrapingTestingItem()
#item['reviews'] = sel.xpath('//p[#class="partial_entry"]/text()').extract()
# These queries start with '//' on the page selector, so every item gets the
# values of the whole page rather than of one review.
item['subjects'] = sel.xpath('//span[#class="noQuotes"]/text()').extract()
item['stars'] = sel.xpath('//*[#class="rate sprite-rating_s rating_s"]/img/#alt').extract()
item['names'] = sel.xpath('//*[#class="username mo"]/span/text()').extract()
items.append(item)
i+=1
# Re-run the "Next" query, restoring `sites` to a list of hrefs
sites = sel.xpath('//a[contains(text(), "Next")]/#href').extract()
if(sites and len(sites) > 0):
# The URL is built without a scheme ("tripadvisor.in" + href); the update in
# the answer below prefixes "http://" instead.
yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
else:
# Yields the whole list as one object, not the individual items
yield items
If you want to select the URL behind Next why don't you try something like this:
# href of every link whose text contains "Next" (the closing quote of the
# XPath literal was missing, which made the line a syntax error)
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
And then yield a Request with this URL? With this you get always the next site to scrape and do not need the line containing the numbers.
Recently I did something similar on TripAdvisor and this approach worked for me. If this doesn't work for you, update your question with the code you are trying so we can see where it can be improved.
Update
And change your Request creation block to the following:
if(sites and len(sites) > 0):
for site in sites:
yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield items at the end of the loop when the method finished with every parsing.
I think it can only work if you make a list of urls you want to scrap in a .txt file.
class scrapingtestspider(Spider):
    """Spider whose start URLs are read from ``urls.txt``, one URL per line."""

    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    # Read once, when the class is defined. The context manager closes the
    # file even if reading fails, and blank lines are skipped so they do not
    # end up as empty start URLs.
    with open("urls.txt") as f:
        start_urls = [line.strip() for line in f if line.strip()]