I am trying to build a spider that gathers information about startups. To do so, I wrote a Python script with Scrapy that should access the website and store the information in a dictionary. I think the code should work from a logical point of view, but somehow I do not get any output. My code:
import scrapy
class StartupsSpider(scrapy.Spider):
    """Spider that walks the BMWK INVEST listing and yields one dict per startup.

    ``parse`` handles a listing page and schedules one request per startup
    detail page; ``parse_startup`` extracts the fields of a detail page.
    """

    # name of the spider
    name = 'startups'
    # list of allowed domains
    # NOTE(review): this entry is a full URL path, not a bare domain name.
    allowed_domains = ['www.bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']
    # starting url
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']

    def parse(self, response):
        """Request every startup linked from this listing page, then the next page."""
        # parse initial start URL for the specific startup URL
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        for startup in startups:
            absolute_url = response.urljoin(startup)
            # parse the actual startup information
            yield scrapy.Request(absolute_url, callback=self.parse_startup)
        # link to next page
        next_page_url = response.xpath('//*[@class ="pagination-link"]/@href').get()
        # go through all pages on start URL
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        """Yield a dict with the startup's details and contact data."""
        # get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@class="document-info-item"]/a/@href').get()
        # The fixed indexes below pick individual text nodes of the info items.
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@class="date"]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@class="date"]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class ="person-contact"]/p/a/span/text()').get()
        contact_address_street = response.xpath('//*[@class ="adr"]/text()').get()
        contact_address_plz = response.xpath('//*[@class ="locality"]/text()').getall()
        contact_state = response.xpath('//*[@class ="country-name"]/text()').get()
        yield {'Startup': startup_name,
               'Homepage': startup_hompage,
               'Description': startup_description,
               'Branche': branche,
               'Gründungsdatum': founded,
               'Anzahl Mitarbeiter': employees,
               'Kapital Bedarf': capital,
               'Datum des Förderbescheids': applied_for_invest,
               'Contact': contact_name,
               'Telefon': contact_phone,
               'E-Mail': contact_mail,
               # NOTE(review): .get() gives a str and .getall() a list, so
               # this expression mixes str and list operands.
               'Adresse': contact_address_street + contact_address_plz + contact_state}
1. You're not getting output because your allowed_domains is wrong.
2. In the last line (Adresse), you're trying to concatenate list and str types, so you'll get an error.
3. Your pagination link is wrong: on the first page you're getting the next page, and on the second page you're getting the previous page.
4. You're not doing any error checking. On some pages you're getting None for some of the values and you're trying to get their i'th character, which results in an error.
I fixed 1, 2, and 3, but you'll need to fix number 4 yourself.
import scrapy
class StartupsSpider(scrapy.Spider):
    """Spider that walks the BMWK INVEST listing and yields one dict per startup.

    ``parse`` handles a listing page and follows the pagination;
    ``parse_startup`` extracts the fields of a single startup page.
    """

    # name of the spider
    name = 'startups'
    # list of allowed domains
    allowed_domains = ['bmwk.de']
    # starting url
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']

    def parse(self, response):
        """Request every startup linked from this listing page, then the next page."""
        # parse initial start URL for the specific startup URL
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        for startup in startups:
            absolute_url = response.urljoin(startup)
            # parse the actual startup information
            yield scrapy.Request(absolute_url, callback=self.parse_startup)

        # link to next page: take the last pagination link on the page
        next_page_url = response.xpath('(//*[@class ="pagination-link"])[last()]/@href').get()
        if next_page_url:
            # go through all pages on start URL
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        """Yield a dict with the startup's details and contact data."""
        # get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@class="document-info-item"]/a/@href').get()
        # for example for some of the pages you'll get an error here:
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@class="date"]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@class="date"]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class ="person-contact"]/p/a/span/text()').get()
        # Join every text node of the address element into one string.
        address = ' '.join(response.xpath('//*[@class ="address"]//text()').getall())
        yield {'Startup': startup_name,
               'Homepage': startup_hompage,
               'Description': startup_description,
               'Branche': branche,
               'Gründungsdatum': founded,
               'Anzahl Mitarbeiter': employees,
               'Kapital Bedarf': capital,
               'Datum des Förderbescheids': applied_for_invest,
               'Contact': contact_name,
               'Telefon': contact_phone,
               'E-Mail': contact_mail,
               'Adresse': address}
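To write the scraped items to a file, run the spider from the command prompt and pass its name together with an output file:
scrapy crawl startups -o filename.json (or filename.csv)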
Related
I'm trying to extract webpage data and wished to take the next few pages also but up to a limit, which I can alter. However, I've tested to see if I can at least extract the next few web-pages using Scrapy (As I'm trying to figure this out in Scrapy to learn it), but It only returns the items within the first page.
How do I extract the next pages while setting a limit i.e. 5 pages
For example, here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
class StatisticsItem(scrapy.Item):
    """Item with two fields, each reduced to its first collected value."""

    # TakeFirst() is the output processor for both fields.
    ebay_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
    """Spider that collects name, price, URL and location of eBay listings."""

    name = 'ebay'
    start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
                  '&LH_PrefLoc=2&mag=1&_sop=16']

    def start_requests(self):
        """Issue one request per start URL."""
        for url in self.start_urls:
            yield scrapy.Request(
                url
            )

    def parse(self, response):
        """Request the detail page of every product card, then the next page."""
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')

        for card in all_cards:
            name = card.xpath('.//h3/text()').get()  # get name of product
            price = card.xpath('.//span[@class="s-item__price"]//text()').get()  # price
            product_url = card.xpath('.//a[@class="s-item__link"]//@href').get()  # link to product
            # now do whatever you want, append to dictionary, yield as item...
            summary_data = {
                "Name": name,
                "Price": price,
                "URL": product_url
            }
            # The summary travels to the detail callback through request meta.
            data = {'summary_data': summary_data}
            yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)

        # get the next page
        # NOTE(review): the lookup is relative to `card` (the last product
        # card of the loop), not to the whole response.
        next_page_url = card.xpath('.//a[@class="pagination__next icon-link"]/@href').extract_first()
        # The last page do not have a valid url and ends with '#'
        if next_page_url is None or str(next_page_url).endswith("#"):
            self.log("eBay products collected successfully !!!")
        else:
            print('\n' + '-' * 30)
            print('Next page: {}'.format(next_page_url))
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_product_details(self, response):
        """Add the item location to the summary passed via meta and yield it."""
        # Get the summary data
        data = response.meta['summary_data']
        data['location'] = response.xpath('//span[@itemprop="availableAtOrFrom"]/text()').extract_first()
        yield data
# Run the spider in-process and export the items as JSON lines.
# FEEDS is the current feed-export setting; it replaces the deprecated
# FEED_URI / FEED_FORMAT pair.
process = CrawlerProcess(
    settings={
        'FEEDS': {
            'collectible_cards.json': {'format': 'jsonlines'},
        },
    }
)
process.crawl(StatisticsSpider)
# start() blocks until the crawl has finished.
process.start()
You can try it like this: first build the list of page URLs, then let start_requests iterate over them:
# Number of result pages to crawl; change this value to move the limit.
MAX_PAGES = 5
# One URL per page, numbered 1..MAX_PAGES through the `_pgn` query parameter.
# range() excludes its upper bound, hence MAX_PAGES + 1.
start_urls = ["https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(i) for i in range(1, MAX_PAGES + 1)]
I am writing a scrapy spider to scrape Rightmove, a property website. The issue I'm having is that the property search, which consists of several pages of different house listings, is all located under the same URL.
This means that the usual process of identifying the URL of the 'next' page doesn't work. Is there any way, using scrapy and not selenium (not efficient enough for the purpose) that I can navigate through the different pages? Please see my code and the source code of the relevant 'next page' button as the IMG below.
Thanks.
class listingsSpider(scrapy.Spider):
    """Spider that yields the property-card titles of a Rightmove search."""

    name = 'listings'
    start_urls = ['https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&furnishTypes=&keywords=']

    def parse(self, response):
        """Yield one dict per listing title, then try to follow the next page."""
        self.logger.info('This my first spider')
        address = response.xpath('//*[@id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
        listings = response.xpath('//h2[@class="propertyCard-title"]')
        for listing in listings:
            yield {
                'Listing': listing.get()
            }

        # Two attempts at locating the "next page" control.
        nextPage = response.xpath('//*[@id="l-container"]/div[3]/div/div/div/div[3]/button/div/svg/use')
        nextPage = nextPage.get()
        pageTest = response.css('div[class=pagination-button pagination-direction pagination-direction--next] svg a::attr(href)')
        pageTest = pageTest.get()
        if pageTest is not None:
            pageTest = response.urljoin(pageTest)
            yield scrapy.Request(pageTest, callback=self.parse)
[![enter image description here][1]][1]
[1]: https://i.stack.imgur.com/1I1J1.png
Actually, it turns out that each page has a unique identifier in the URL. For example, appending &index=24 takes you to the second page.
What you need to figure out is how to include that parameter in the request URL. A search may span several pages, so we increase the index by 24 each time to reach the next page. Since we could keep adding 24 forever, we use the total number of results shown on the page as the condition to stop. It is easy to miss at first sight, but simple to handle once you have noticed it.
Here's a scraper that can go to these next pages as requested:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
import requests
from bs4 import BeautifulSoup
# Build the list of result-page URLs. Pages are addressed through the
# `index` query parameter in steps of 24, capped at 480.
links = []
for i in range(0, 480, 24):
    url = f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&index={i}&furnishTypes=&keywords='
    # A timeout keeps an unresponsive server from hanging the script.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'lxml')
    count_tag = soup.find('span', {'class': 'searchHeader-resultCount'})
    if count_tag is None:
        # Without a result count there is nothing to compare the offset with.
        break
    # Drop thousands separators so a count like "1,234" still parses.
    result_count = int(count_tag.text.strip().replace(',', ''))
    if result_count <= i:
        # The offset is past the last result: stop requesting further pages.
        break
    links.append(url)
class ListingsItem(scrapy.Item):
    """Item with the address and title of one property card."""

    # TakeFirst() is the output processor for both fields.
    address = Field(output_processor=TakeFirst())
    listings = Field(output_processor=TakeFirst())
class listingsSpider(scrapy.Spider):
    """Spider that loads one ListingsItem per property card on each page."""

    name = 'listings'
    start_urls = links

    def start_requests(self):
        """Issue one request per pre-computed result-page URL."""
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        """Yield an item for every child div of the search-results container."""
        container = response.xpath('//div[@class="l-searchResults"]/div')
        for sales in container:
            loader = ItemLoader(ListingsItem(), selector=sales)
            # The leading "." keeps each XPath relative to the current card;
            # a bare "//" would search the whole document for every card.
            loader.add_xpath('address', './/address[@class="propertyCard-address"]/meta[@content]')
            loader.add_xpath('listings', './/h2[@class="propertyCard-title"]//text()[normalize-space()]')
            yield loader.load_item()
# Run the spider in-process and export the items as JSON lines.
# FEEDS is the current feed-export setting; it replaces the deprecated
# FEED_URI / FEED_FORMAT pair.
process = CrawlerProcess(
    settings={
        'FEEDS': {
            'rightmove.jl': {'format': 'jsonlines'},
        },
    }
)
process.crawl(listingsSpider)
# start() blocks until the crawl has finished.
process.start()
So I'm relatively new to scrapy and am trying to get a crawler that pulls hyper links for businesses on a listing page. Here is the code:
class EmailSpider(CrawlSpider):
    """Spider that follows business website links and collects e-mail addresses."""

    name = "emailcrawler"
    start_urls = [
        'https://www.yellowpages.com/search?search_terms=Computer+Software+%26+Services&geo_location_terms=Florence%2C+KY'
        # 'https://www.yellowpages.com/search?search_terms=Computers+%26+Computer+Equipment-Service+%26+fix&geo_location_terms=FL'
    ]

    def parse(self, response):
        """Request the website of every listing, then the next listing page."""
        information = response.xpath('//*[@class="info"]')
        for info in information:
            website = info.xpath('.//*[@class="links"]/a/@href').extract_first()
            # NOTE(review): extract_first() returns None (not the string
            # "None") when nothing matches, so this test does not filter it.
            if website != "None":
                request = Request(url=website, callback=self.parse_email, errback=self.handle_error,
                                  meta={'dont_retry': True, 'dont_redirect': True, 'handle_httpstatus_list': [302]})
                # The listing's website travels to parse_email through meta.
                request.meta['data'] = {
                    'Website': website
                }
                # yield response.follow(url = website, callback = self.parse_email)
                yield request

        next_page_url = response.xpath('//*[@class="next ajax-page"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield Request(absolute_next_page_url, errback=self.handle_error, meta={'dont_retry': True, 'dont_redirect': True})

    def parse_email(self, response):
        """Yield the meta data dict once per ".com" e-mail address on the page."""
        data = response.meta.get('data')
        # try:
        #     emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.com", response.text, re.I))
        # except AttributeError:
        #     return
        # data['email'] = emails
        selector = Selector(response)
        # Raw string: the pattern contains a backslash escape.
        for found_address in selector.re(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.com'):
            # item = EmailAddressItem()
            data['email_address'] = found_address
            # item['url'] = response.url
            yield data

    def handle_error(self, failure):
        """Log the request that failed."""
        self.log("Request failed: %s" % failure.request)
Before I attempted to get scrapy to follow each link, I had it just return the list of websites that it pulled which worked perfectly. It was able to request the next page after iterating through the urls on the page and then yield the results. What I am trying to do now is to get it to go to each website that it pulls, extract an email element on that website if it is found and then return back to the loop and then try another website. The problem is that when the crawler gets a response error the crawl just stops. It also seems like even if the Request was successful, that the crawler is not going to be able to return to the original iteration through the yellowpages url. It gets stuck in one of the websites that it follows and then the for loop dies. How can I get the crawler to stay its course and keep attempting to pull from the websites it scrapes while also staying within the process of iterating through each page of the listing website. To put it simply, I need to be able to go through every single page on the initial listing page no matter what request error comes about, but have the crawler pop in and out of the websites it finds and attempt to scrape data on those sites.
class EmailSpider(CrawlSpider):
    """Spider that follows business website links and collects e-mail addresses."""

    name = "followwebsite"
    # The comma after each entry matters: adjacent string literals without
    # one are concatenated into a single URL.
    start_urls = [
        # 'https://www.manta.com/mb_35_D000B000_000/offices_and_clinics_of_medical_doctors',
        # 'https://www.chess.com/home'
        # 'https://webscraper.io/test-sites/e-commerce/static'
        'https://www.yellowpages.com/search?search_terms=Computer+Software+%26+Services&geo_location_terms=Florence%2C+KY',
        'https://www.yellowpages.com/search?search_terms=Computers+%26+Computer+Equipment-Service+%26+fix&geo_location_terms=FL',
    ]

    def parse(self, response):
        """Follow every listed website, then the next listing page if there is one."""
        website = response.xpath('//*[@class="links"]/a/@href')
        # The errback routes failures on the followed sites to handle_error.
        yield from response.follow_all(website, callback=self.parse_email, errback=self.handle_error)

        next_page_url = response.xpath('//*[@class="next ajax-page"]/@href').extract_first()
        # extract_first() returns None on the last page; skip the request then.
        if next_page_url:
            absolute_next_page_url = response.urljoin(next_page_url)
            yield Request(absolute_next_page_url, errback=self.handle_error)

    def parse_email(self, response):
        """Yield one EmailAddressItem per ".com" e-mail address on the page."""
        selector = Selector(response)
        # Raw string: the pattern contains a backslash escape.
        for found_address in selector.re(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.com'):
            item = EmailAddressItem()
            item['email_address'] = found_address
            yield item

    def handle_error(self, failure):
        """Log the request that failed at error level."""
        self.logger.error("Request failed: %s", failure.request)
Figured it out no thanks to you bums
I am writing a web scraper to fetch a group of links
(located at tree.xpath('//div[@class="work_area_content"]/a/@href'))
from a website and return the Title and Url of all the leafs sectioned by the leafs parent. I have two scrapers: one in python and one in Scrapy for Python. What is the purpose of callbacks in the Scrapy Request method? Should the information be in a multidimensional or single dimension list ( I believe multi-dimensional but it enhances complication)? Which of the below code is better? If the scraper code is better, how do I migrate the python code to the Scrapy code?
From what I understand from callbacks is that it passes a function's arguments to another function; however, if the callback refers to itself, the data gets overwritten and therefore lost, and you're unable to go back to the root data. Is this correct?
python:
url_storage = [ [ [ [] ] ] ]
page = requests.get('http://1.1.1.1:1234/TestSuites')
tree = html.fromstring(page.content)
urls = tree.xpath('//div[#class="work_area_content"]/a/#href').extract()
i = 0
j = 0
k = 0
for i, url in enumerate(urls):
absolute_url = "".join(['http://1.1.1.1:1234/', url])
url_storage[i][j][k].append(absolute_url)
print(url_storage)
#url_storage.insert(i, absolute_url)
page = requests.get(url_storage[i][j][k])
tree2 = html.fromstring(page.content)
urls2 = tree2.xpath('//div[#class="work_area_content"]/a/#href').extract()
for j, url2 in enumerate(urls2):
absolute_url = "".join(['http://1.1.1.1:1234/', url2])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree3 = html.fromstring(page.content)
urls3 = tree3.xpath('//div[#class="work_area_content"]/a/#href').extract()
for k, url3 in enumerate(urls3):
absolute_url = "".join(['http://1.1.1.1:1234/', url3])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree4 = html.fromstring(page.content)
urls3 = tree4.xpath('//div[#class="work_area_content"]/a/#href').extract()
title = tree4.xpath('//span[#class="page_title"]/text()').extract()
yield Request(url_storage[i][j][k], callback=self.end_page_parse_TS, meta={"Title": title, "URL": urls3 })
#yield Request(absolute_url, callback=self.end_page_parse_TC, meta={"Title": title, "URL": urls3 })
def end_page_parse_TS(self, response):
    """Yield the URL and title that were passed in through the request meta.

    Args:
        response: response whose ``meta`` may carry ``'URL'`` and ``'Title'``;
            a missing key yields ``None`` for that field.

    Yields:
        A dict with the keys ``'URL'`` and ``'Title'``.
    """
    # Debug output of the raw page body.
    print(response.body)
    url = response.meta.get('URL')
    title = response.meta.get('Title')
    yield {'URL': url, 'Title': title}
def end_page_parse_TC(self, response):
    """Yield URL and title from the request meta plus a scraped description.

    Args:
        response: response whose ``meta`` may carry ``'URL'`` and ``'Title'``.

    Yields:
        A dict with the keys ``'URL'``, ``'Title'`` and ``'Description'``;
        the description is the list returned by ``extract()``.
    """
    url = response.meta.get('URL')
    title = response.meta.get('Title')
    # The description is always taken from the page, so the value that may
    # sit in meta under 'Description' is not read.
    # NOTE(review): this XPath was reconstructed from a malformed expression
    # (unbalanced quote and bracket); it selects the parent of each cell of
    # the wiki table whose text contains "description" - confirm the intent.
    description = response.xpath(
        '//table[@class="wiki_table"]//td[contains(text(), "description")]/parent::*'
    ).extract()
    yield {'URL': url, 'Title': title, 'Description': description}
Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractor import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem
class DatabloggerSpider(CrawlSpider):
    """Crawl spider that records, for every visited page, the links it contains."""

    # The name of the spider
    name = "datablogger"

    # The domains that are allowed (links to other domains are skipped).
    # Entries are host names only, without scheme, port or path.
    allowed_domains = ['1.1.1.1']

    # The URLs to start with
    start_urls = ['http://1.1.1.1:1234/TestSuites']

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_items method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_items"
        )
    ]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        """Issue one unfiltered request per start URL."""
        for url in self.start_urls:
            # No explicit callback: the response goes to the spider's default
            # callback, which CrawlSpider uses to apply the rules.
            yield scrapy.Request(url, dont_filter=True)

    # Method for parsing items
    def parse_items(self, response):
        """Return one item per link found on the page.

        Each item stores the page URL in ``url_from`` and the link target in
        ``url_to``.
        """
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)

        # The list of items that are found on the particular page
        items = []
        for link in links:
            # A fresh item per link: reusing one object would leave every
            # list entry pointing at the last link.
            item = DatabloggerScraperItem()
            item['url_from'] = response.url
            item['url_to'] = link.url
            items.append(item)

        # Return all the found items
        return items
I'm new to Scrapy and I can't get my spider to enter parse_votes in the code below, even though I set it as the callback. The other parse methods are working fine, I don't get any ERROR, and I checked the 'link' variable, which has the correct info. HELP?
EDIT - Full code
class DeputadosSpider(scrapy.Spider):
    """Spider that collects deputy profiles and requests their voting pages."""

    name = "deputies"
    allowed_domains = ["camara.leg.br"]
    start_urls = ["http://www2.camara.leg.br/deputados/pesquisa"]

    def parse(self, response):
        """Request the detail page of every deputy in the ``deputado`` select box."""
        sel = Selector(response)
        # position()>1 skips the first <option> of the select box.
        sel_options = sel.xpath('//*[@id="deputado"]/option[position()>1]')
        # get deputies pages; ids are assigned sequentially starting at 1
        for iteration, sel_option in enumerate(sel_options, start=1):
            item = DeputiesInfo()
            item["war_name"] = sel_option.xpath("text()").extract()
            # The id is the text between the '?' and the last '"' of the
            # option's markup.
            item["link_id"] = sel_option.extract().partition('?')[-1].rpartition('"')[0]
            item["page_link"] = 'http://www.camara.leg.br/internet/Deputado/dep_Detalhe.asp?id=' + item["link_id"]
            item["id"] = iteration
            # go scrap their page
            yield scrapy.Request(item["page_link"], callback=self.parse_deputy, meta={'item': item})

    def parse_deputy(self, response):
        """Complete the deputy item, append it to a file and request vote pages."""
        item = response.meta['item']
        sel = Selector(response)
        info = sel.xpath('//div[@id="content"]/div/div[1]/ul/li')
        # Extract the text nodes once and index into the result below.
        texts = info.xpath("text()").extract()
        # end to fill the data
        item["full_name"] = texts[0] if texts else None
        item["party"] = texts[2].partition('/')[0]
        item["uf"] = texts[2].partition('/')[-1].rpartition('/')[0]
        item["legislatures"] = texts[5]
        item["picture"] = sel.xpath('//div[@id="content"]/div/div[1]//img[1]/@src').extract()
        # save data to json file; the with-block closes the handle again
        line = json.dumps(dict(item)) + ",\n"
        with open('deputies_info.json', 'a') as out:
            out.write(line)
        # colect votes info
        get_years = sel.xpath('//*[@id="my-informations"]/div[3]/div/ul/li[1]/a[position()<4]')
        for get_year in get_years:
            vote = VotesInfo()
            vote["deputy_id"] = item["id"]
            vote["year"] = get_year.xpath("text()").extract_first()
            link = get_year.xpath("@href").extract_first()
            print(vote["year"])
            print(link)
            # go to voting pages
            yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote})

    def parse_votes(self, response):
        """Print a marker showing that the callback was reached."""
        #vote = response.meta['vote']
        print('YYYYYYYYYYYYYUHUL IM IN!!')
Your problem is allowed_domains, because the link you are trying to request in parse_deputy is for example: http://www.camara.gov.br/internet/deputado/RelVotacoes.asp?nuLegislatura=55&nuMatricula=410&dtInicio=01/01/2016&dtFim=30/12/2016
and its domain is camara.gov.br so add it to allowed_domains.
# camara.gov.br is listed as well because the voting-page links point to that domain.
allowed_domains = ["camara.leg.br", "camara.gov.br"]
PS: I ran your code with allowed_domains commented out, and parse_votes works perfectly.
I ran your spider and found out why it never enters parse_votes.
I checked the link in yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote}) and found out that it is not in the same domain
The link belongs to the camara.gov.br domain, which does not belong to the allowed_domains = ["camara.leg.br"]
So you need to add this domain to the allowed_domains list.
# Both domains are needed: the links requested for parse_votes belong to camara.gov.br.
allowed_domains = ["camara.leg.br", "camara.gov.br"]