I am trying to build a spider that gathers information about startups. I wrote a Python script with Scrapy that should access the website and store the information in a dictionary. I think the code should work from a logic point of view, but somehow I do not get any output. My code:
import scrapy

class StartupsSpider(scrapy.Spider):
    name = 'startups'
    #name of the spider
    allowed_domains = ['www.bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']
    #list of allowed domains
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']
    #starting url

    def parse(self, response):
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        #parse initial start URL for the specific startup URL
        for startup in startups:
            absolute_url = response.urljoin(startup)
            yield scrapy.Request(absolute_url, callback=self.parse_startup)
            #parse the actual startup information
        next_page_url = response.xpath('//*[@class="pagination-link"]/@href').get()
        #link to next page
        absolute_next_page_url = response.urljoin(next_page_url)
        #go through all pages on start URL
        yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        #get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@class="document-info-item"]/a/@href').get()
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@class="date"]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@class="date"]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class="person-contact"]/p/a/span/text()').get()
        contact_address_street = response.xpath('//*[@class="adr"]/text()').get()
        contact_address_plz = response.xpath('//*[@class="locality"]/text()').getall()
        contact_state = response.xpath('//*[@class="country-name"]/text()').get()

        yield {'Startup': startup_name,
               'Homepage': startup_hompage,
               'Description': startup_description,
               'Branche': branche,
               'Gründungsdatum': founded,
               'Anzahl Mitarbeiter': employees,
               'Kapital Bedarf': capital,
               'Datum des Förderbescheids': applied_for_invest,
               'Contact': contact_name,
               'Telefon': contact_phone,
               'E-Mail': contact_mail,
               'Adresse': contact_address_street + contact_address_plz + contact_state}
You're not getting any output because of several problems:

1. Your allowed_domains is wrong.
2. In the last line (Adresse), you're trying to concatenate list and str types, so you'll get an error.
3. Your pagination link is wrong: on the first page you're getting the next page, but on the second page you're getting the previous page.
4. You're not doing any error checking. On some pages you're getting None for some of the values, and trying to index into them results in an error.

I fixed 1, 2, and 3, but you'll need to fix number 4 yourself (a sketch of one option follows the code below).
import scrapy

class StartupsSpider(scrapy.Spider):
    # name of the spider
    name = 'startups'
    # list of allowed domains
    allowed_domains = ['bmwk.de']
    # starting url
    start_urls = ['https://bmwk.de/Navigation/DE/InvestDB/INVEST-DB_Liste/investdb.html']

    def parse(self, response):
        # parse initial start URL for the specific startup URLs
        startups = response.xpath('//*[contains(@class,"card-link-overlay")]/@href').getall()
        for startup in startups:
            absolute_url = response.urljoin(startup)
            # parse the actual startup information
            yield scrapy.Request(absolute_url, callback=self.parse_startup)

        # link to next page
        next_page_url = response.xpath('(//*[@class="pagination-link"])[last()]/@href').get()
        if next_page_url:
            # go through all pages on start URL
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url)

    def parse_startup(self, response):
        # get information regarding startup
        startup_name = response.css('h1::text').get()
        startup_hompage = response.xpath('//*[@class="document-info-item"]/a/@href').get()
        # for example, on some of the pages you'll get an error here:
        startup_description = response.css('div.document-info-item::text')[16].get()
        branche = response.css('div.document-info-item::text')[4].get()
        founded = response.xpath('//*[@class="date"]/text()')[0].getall()
        employees = response.css('div.document-info-item::text')[9].get()
        capital = response.css('div.document-info-item::text')[11].get()
        applied_for_invest = response.xpath('//*[@class="date"]/text()')[1].getall()
        contact_name = response.css('p.card-title-subtitle::text').get()
        contact_phone = response.css('p.tel > span::text').get()
        contact_mail = response.xpath('//*[@class="person-contact"]/p/a/span/text()').get()
        Adresse = ' '.join(response.xpath('//*[@class="address"]//text()').getall())

        yield {'Startup': startup_name,
               'Homepage': startup_hompage,
               'Description': startup_description,
               'Branche': branche,
               'Gründungsdatum': founded,
               'Anzahl Mitarbeiter': employees,
               'Kapital Bedarf': capital,
               'Datum des Förderbescheids': applied_for_invest,
               'Contact': contact_name,
               'Telefon': contact_phone,
               'E-Mail': contact_mail,
               'Adresse': Adresse}
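For number 4, one possible approach is to guard the indexed lookups so that a missing element yields None instead of raising IndexError. This is just a minimal sketch; the safe_get helper is my own illustration, not part of Scrapy:

def safe_get(selectors, index):
    # return the extracted text at the given position,
    # or None if that element does not exist on this page
    try:
        return selectors[index].get()
    except IndexError:
        return None

In parse_startup you would then build the selector list once, e.g. infos = response.css('div.document-info-item::text'), and use safe_get(infos, 16) instead of indexing directly.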
To export the output, run the spider from the command prompt with an output file:

scrapy crawl startups -o filename.json

(or use a .csv extension for CSV output).
I have this piece of code where I try to download these papers, but the loop prints the first element only.
import scrapy
from urllib.parse import urljoin

class SimpleSpider(scrapy.Spider):
    name = 'simple'
    start_urls = ['https://jmedicalcasereports.biomedcentral.com/articles?query=COVID-19&searchType=journalSearch&tab=keyword']

    def parse(self, response):
        for book in response.xpath('//*[@id="main-content"]/div/main/div[2]/ol'):
            title = response.xpath('/li[3]/article/h3/a/text()').get()
            link = urljoin(
                'https://jmedicalcasereports.biomedcentral.com/', response.xpath('/li[3]/article/ul/li[2]/a/@href').get()
            )
            yield {
                'Title': title,
                'file_urls': [link]
            }
I tried CSS selectors and then XPath; the problem is with the loop code.
Firstly, in the line where you assign title, response should be changed to book:

title = book.xpath('.//a/text()').get()

Secondly, the XPath in your for loop is incorrect, so the result is not correct. This is my code; I hope it helps you.
def parse(self, response):
    for book in response.xpath('//li[@class="c-listing__item"]'):
        title = book.xpath('.//a/text()').get()
        link = urljoin(
            'https://jmedicalcasereports.biomedcentral.com/', book.xpath('.//a/@href').get()
        )
        yield {
            'Title': title,
            'file_urls': [link]
        }
The response is:

{'Title': 'Presentation of COVID-19 infection with bizarre behavior and encephalopathy: a case report', 'file_urls': ['https://jmedicalcasereports.biomedcentral.com/articles/10.1186/s13256-021-02851-0']}
2022-04-17 21:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://jmedicalcasereports.biomedcentral.com/articles?query=COVID-19&searchType=journalSearch&tab=keyword>
{'Title': 'Dysentery as the only presentation of COVID-19 in a child: a\xa0case report', 'file_urls': ['https://jmedicalcasereports.biomedcentral.com/articles/10.1186/s13256-021-02672-1']}
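A side note on the downloads themselves: the 'file_urls' field only triggers actual file downloads if Scrapy's built-in FilesPipeline is enabled in the project settings. A minimal sketch of the relevant settings (the storage directory is just an example):

# settings.py (sketch): enable the built-in FilesPipeline so that items
# carrying a 'file_urls' list are downloaded automatically
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = 'downloads'  # example directory; adjust to your project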
I have been trying to get all the properties from this website.
When I access them on the main search page, I can retrieve all the information for all the properties. However, when I need the information from the actual property link, the spider only seems to follow one property link.
The main issue is in the link part: when I actually try to access the link of the property, I only get the link and information from the first property, but not from all the others.
class PropDataSpider(scrapy.Spider):
    name = "remax"
    start_urls = ['https://www.remax.co.za/property_search/for-sale/?minprice=100000&maxprice=1000000000&displayorder=date&cities=432']

    def parse(self, response):
        propertes = response.xpath("//div[@class='w-container main-content remodal-bg']")
        for prop in propertes:
            link = 'http://www.remax.co.za/' + prop.xpath("./a/@href").extract_first()
            agency = self.name
            title = prop.xpath(
                ".//div[@class='property-item']/div[@class='w-clearfix']/p[@class='property-type']/text()").extract_first().strip()
            price = prop.xpath(
                ".//div[@class='property-item']/div[@class='w-clearfix']/div/strong/text()").extract_first().strip()
            ...
            yield scrapy.Request(
                link,
                callback=self.parse_property,
                meta={
                    'agency': agency,
                    'title': title,
                    'price': price,
                    'description': description,
                    'bedrooms': bedrooms,
                    'bathrooms': bathrooms,
                    'garages': garages,
                }
            )

    def parse_property(self, response):
        agency = response.meta["agency"]
        title = response.meta["title"]
        price = response.meta["price"]
        description = response.meta["description"]
        bedrooms = response.meta["bedrooms"]
        bathrooms = response.meta["bathrooms"]
        garages = response.meta["garages"]

        yield {'agency': agency, 'title': title, 'price': price, 'description': description,
               'bedrooms': bedrooms, 'bathrooms': bathrooms, 'garages': garages}
What I would like to get is all the other links to properties. I am not sure what I am doing wrong and how to fix this.
Thank you very much for your help!
You need a couple of changes. Your XPath selects the single container div, so the loop only runs once; select the property <a> elements inside it instead, and the loop will run once per listing:

properties = response.xpath("//div[@class='w-container main-content remodal-bg']/a")

for prop in properties:
    link = 'http://www.remax.co.za/' + prop.xpath("./@href").extract_first()
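For context, here is a sketch of how the corrected selector slots into your parse() method (the remaining field extractions would stay as in your code, though their relative XPaths may need adjusting now that prop is the <a> element rather than the container):

def parse(self, response):
    # select each property's <a> element rather than the single container <div>,
    # so the loop yields one request per listing instead of one in total
    properties = response.xpath("//div[@class='w-container main-content remodal-bg']/a")
    for prop in properties:
        link = 'http://www.remax.co.za/' + prop.xpath("./@href").extract_first()
        yield scrapy.Request(link, callback=self.parse_property,
                             meta={'agency': self.name})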
I have the following structure (sample). I am using Scrapy to extract the details. I need to extract the 'href' fields and the text, e.g. 'Accounting'. I am using the code below. I am new to XPath; any help extracting these specific fields would be appreciated.
<div class='something'>
    <ul>
        <li>Accounting</li>
        <li>Administrative</li>
        <li>Advertising</li>
        <li>Airline</li>
    </ul>
</div>
My code is:
from scrapy.spider import BaseSpider
from jobfetch.items import JobfetchItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose

class JobFetchSpider(BaseSpider):
    """Spider for regularly updated livingsocial.com site, San Francisco Page"""
    name = "Jobsearch"
    allowed_domains = ["jobsearch.about.com/"]
    start_urls = ['http://jobsearch.about.com/od/job-titles/fl/job-titles-a-z.htm']

    def parse(self, response):
        count = 0
        for sel in response.xpath('//*[@id="main"]/div/div[2]/div[1]/div/div[2]/article/div[2]/ul[1]'):
            item = JobfetchItem()
            item['title'] = sel.extract()
            item['link'] = sel.extract()
            count = count + 1
            print item
        yield item
The problems you have in the code:

- yield item should be inside the loop, since you are instantiating items there
- the xpath you have is pretty messy and not quite reliable, since it relies heavily on the elements' location inside their parent tags and starts almost from the top parent of the document
- your xpath is incorrect - it should go down to the a elements inside li inside ul
- sel.extract() would only give you that ul element extracted
For the sake of an example, use a CSS selector here to get to the li tags:
import scrapy
from jobfetch.items import JobfetchItem

class JobFetchSpider(scrapy.Spider):
    name = "Jobsearch"
    allowed_domains = ["jobsearch.about.com/"]
    start_urls = ['http://jobsearch.about.com/od/job-titles/fl/job-titles-a-z.htm']

    def parse(self, response):
        for sel in response.css('article[itemprop="articleBody"] div.expert-content-text > ul > li > a'):
            item = JobfetchItem()
            item['title'] = sel.xpath('text()').extract()[0]
            item['link'] = sel.xpath('@href').extract()[0]
            yield item
Running the spider produces:
{'link': u'http://jobsearch.about.com/od/job-title-samples/a/accounting-job-titles.htm', 'title': u'Accounting'}
{'link': u'http://jobsearch.about.com/od/job-title-samples/a/admin-job-titles.htm', 'title': u'Administrative'}
...
{'link': u'http://jobsearch.about.com/od/job-title-samples/fl/yacht-job-titles.htm', 'title': u'Yacht Jobs'}
FYI, we could have used xpath() also:
//article[@itemprop="articleBody"]//div[@class="expert-content-text"]/ul/li/a
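For completeness, a sketch of the same parse() loop using that XPath instead of the CSS selector (same item fields as above):

def parse(self, response):
    # identical logic to the CSS version, only the selector changes
    for sel in response.xpath('//article[@itemprop="articleBody"]//div[@class="expert-content-text"]/ul/li/a'):
        item = JobfetchItem()
        item['title'] = sel.xpath('text()').extract()[0]
        item['link'] = sel.xpath('@href').extract()[0]
        yield item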
Use the script below to extract the data you want to scrape.
In [1]: response.xpath('//div[@class="expert-content-text"]/ul/li/a/text()').extract()
Out[1]:
[u'Accounting',
u'Administrative',
u'Advertising',
u'Airline',
u'Animal',
u'Alternative Energy',
u'Auction House',
u'Banking',
u'Biotechnology',
u'Business',
u'Business Intelligence',
u'Chef',
u'College Admissions',
u'College Alumni Relations and Development ',
u'College Student Services',
u'Construction',
u'Consulting',
u'Corporate',
u'Cruise Ship',
u'Customer Service',
u'Data Science',
u'Engineering',
u'Entry Level Jobs',
u'Environmental',
u'Event Planning',
u'Fashion',
u'Film',
u'First Job',
u'Fundraiser',
u'Healthcare/Medical',
u'Health/Safety',
u'Hospitality',
u'Human Resources',
u'Human Services / Social Work',
u'Information Technology (IT)',
u'Insurance',
u'International Affairs / Development',
u'International Business',
u'Investment Banking',
u'Law Enforcement',
u'Legal',
u'Maintenance',
u'Management',
u'Manufacturing',
u'Marketing',
u'Media',
u'Museum',
u'Music',
u'Non Profit',
u'Nursing',
u'Outdoor ',
u'Public Administration',
u'Public Relations',
u'Purchasing',
u'Radio',
u'Real Estate ',
u'Restaurant',
u'Retail',
u'Sales',
u'School',
u'Science',
u'Ski and Snow Jobs',
u'Social Media',
u'Social Work',
u'Sports',
u'Television',
u'Trades',
u'Transportation',
u'Travel',
u'Yacht Jobs']
In [2]: response.xpath('//div[@class="expert-content-text"]/ul/li/a/@href').extract()
Out[2]:
[u'http://jobsearch.about.com/od/job-title-samples/a/accounting-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/admin-job-titles.htm',
u'http://jobsearch.about.com/od/job-titles/a/advertising-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/airline-industry-jobs.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/animal-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/alternative-energy-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/auction-house-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/banking-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/biotechnology-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/business-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/business-intelligence-job-titles.htm',
u'http://culinaryarts.about.com/od/culinaryfundamentals/a/whatisachef.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/college-admissions-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/college-alumni-relations-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/college-student-service-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/construction-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/consulting-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/c-level-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/cruise-ship-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/customer-service-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/data-science-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/engineering-job-titles.htm',
u'http://jobsearch.about.com/od/best-jobs/a/best-entry-level-jobs.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/environmental-job-titles.htm',
u'http://eventplanning.about.com/od/eventcareers/tp/corporateevents.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/fashion-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/film-job-titles.htm',
u'http://jobsearch.about.com/od/justforstudents/a/first-job-list.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/fundraiser-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/health-care-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/health-safety-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/hospitality-job-titles.htm',
u'http://humanresources.about.com/od/HR-Roles-And-Responsibilities/fl/human-resources-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/human-services-social-work-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/it-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/insurance-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/international-affairs-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/international-business-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/investment-banking-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/law-enforcement-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/legal-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/maintenance-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/management-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/manufacturing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/marketing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/media-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/museum-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/music-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/nonprofit-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/nursing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/outdoor-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/public-administration-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/public-relations-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/purchasing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/radio-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/real-estate-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/restaurant-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/retail-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/sales-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/high-school-middle-school-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/science-job-titles.htm',
u'http://jobsearch.about.com/od/skiandsnowjobs/a/skijob2_2.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/social-media-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/social-work-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/sports-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/television-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/trades-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/transportation-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/travel-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/yacht-job-titles.htm']
I am trying to scrape a site using Scrapy. My spider is as follows:
class mySpider(CrawlSpider):
    name = "mytest"
    allowed_domains = {'www.example.com'}
    start_urls = ['http://www.example.com']

    rules = [
        Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\w+']), callback='parse_post',
             follow=True)
    ]

    def parse_post(self, response):
        item = PostItem()
        item['url'] = response.url
        item['title'] = response.xpath('//title/text()').extract()
        item['authors'] = response.xpath('//span[@class="author"]/text()').extract()
        return item
Everything works fine, but it only scrapes the links on the homepage. The site loads more articles via a POST request, i.e. a 'click for more articles' button.
Is there any way I can simulate clicking the load-more button so the additional articles are loaded and the scraper continues?
The "Load more articles" button is managed by the javascript, clicking on ti fires up an AJAX post request.
In other words, this is something Scrapy cannot easily handle.
But, if Scrapy is not a requirement, here is a solution using requests and BeautifulSoup:
from bs4 import BeautifulSoup
import requests

url = "http://www.ijreview.com/wp-admin/admin-ajax.php"
session = requests.Session()

page_size = 24
params = {
    'action': 'load_more',
    'numPosts': page_size,
    'category': '',
    'orderby': 'date',
    'time': ''
}

offset = 0
limit = 100
while offset < limit:
    params['offset'] = offset
    response = session.post(url, data=params)
    links = [a['href'] for a in BeautifulSoup(response.content).select('li > a')]
    for link in links:
        response = session.get(link)
        page = BeautifulSoup(response.content)

        title = page.find('title').text.strip()
        author = page.find('span', class_='author').text.strip()

        print {'link': link, 'title': title, 'author': author}

    offset += page_size
Prints:
{'author': u'Kevin Boyd', 'link': 'http://www.ijreview.com/2014/08/172770-president-obama-realizes-world-messy-place-thanks-social-media/', 'title': u'President Obama Calls The World A Messy Place & Blames Social Media for Making People Take Notice'}
{'author': u'Reid Mene', 'link': 'http://www.ijreview.com/2014/08/172405-17-politicians-weird-jobs-time-office/', 'title': u'12 Most Unusual Professions of Politicians Before They Were Elected to Higher Office'}
{'author': u'Michael Hausam', 'link': 'http://www.ijreview.com/2014/08/172653-video-duty-mp-fakes-surrender-shoots-hostage-taker/', 'title': u'Video: Off-Duty MP Fake Surrenders at Gas Station Before Revealing Deadly Surprise for Hostage Taker'}
...
You may need to tweak the code so that it supports different categories, ordering, etc. You can also improve the HTML parsing speed by letting BeautifulSoup use the lxml parser under the hood: instead of BeautifulSoup(response.content), use BeautifulSoup(response.content, "lxml"), but you would need to install lxml.
This is how you can adjust the solution to Scrapy:
import urllib

from scrapy import Item, Field, Request, Spider

class mySpider(Spider):
    name = "mytest"
    allowed_domains = {'www.ijreview.com'}

    def start_requests(self):
        page_size = 25

        headers = {'User-Agent': 'Scrapy spider',
                   'X-Requested-With': 'XMLHttpRequest',
                   'Host': 'www.ijreview.com',
                   'Origin': 'http://www.ijreview.com',
                   'Accept': '*/*',
                   'Referer': 'http://www.ijreview.com/',
                   'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}

        # step through the offsets in increments of page_size
        for offset in range(0, 200, page_size):
            yield Request('http://www.ijreview.com/wp-admin/admin-ajax.php',
                          method='POST',
                          headers=headers,
                          body=urllib.urlencode(
                              {'action': 'load_more',
                               'numPosts': page_size,
                               'offset': offset,
                               'category': '',
                               'orderby': 'date',
                               'time': ''}))

    def parse(self, response):
        for link in response.xpath('//ul/li/a/@href').extract():
            yield Request(link, callback=self.parse_post)

    def parse_post(self, response):
        item = PostItem()
        item['url'] = response.url
        item['title'] = response.xpath('//title/text()').extract()[0].strip()
        item['authors'] = response.xpath('//span[@class="author"]/text()').extract()[0].strip()
        return item
Outputs:
{'authors': u'Kyle Becker',
'title': u'17 Reactions to the \u2018We Don\u2019t Have a Strategy\u2019 Gaffe That May Haunt the Rest of Obama\u2019s Presidency',
'url': 'http://www.ijreview.com/2014/08/172569-25-reactions-obamas-dont-strategy-gaffe-may-haunt-rest-presidency/'}
...