I'm crawling the following page: http://graphics.stltoday.com/apps/payrolls/salaries/teachers/detail/25074/ and trying to grab each value from the table (salary, job title, years with district, etc.). When I run response.xpath('//th[@scope="row"]/following-sibling::td[1]/text()').extract() in the Scrapy shell, all of the values display. However, when I do the same thing inside the crawler, only the first element (district) appears. Any suggestions?
Crawler code (ideally, each element would go into its own variable for cleaner output):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Spider2(CrawlSpider):
    # name of the spider
    name = 'stlteacher'

    # list of allowed domains
    allowed_domains = ['graphics.stltoday.com']

    # starting url for scraping
    start_urls = ['http://graphics.stltoday.com/apps/payrolls/salaries/teachers/']

    rules = [
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/teachers/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/teachers/[0-9]+/position/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/teachers/detail/[0-9]+/$']),
            callback='parse_item',
            follow=True),
    ]

    # setting the location of the output csv file
    custom_settings = {
        'FEED_FORMAT': "csv",
        'FEED_URI': 'tmp/stlteachers3.csv'
    }

    def parse_item(self, response):
        # Remove XML namespaces
        response.selector.remove_namespaces()

        # Extract article information
        url = response.url
        name = response.xpath('//p[@class="table__title"]/text()').extract()
        district = response.xpath('//th[@scope="row"]/following-sibling::td[1]/text()').extract()
        for item in zip(name, district):
            scraped_info = {
                'url': url,
                'name': item[0],
                'district': item[1],
            }
            yield scraped_info
Your zip is a bit confusing there. If you want to crawl the whole table, you need to iterate through the table rows and pull out each row's name and value.
I got pretty good results with this piece of code:
def parse_item(self, response):
    name = response.xpath('//p[@class="table__title"]/text()').extract_first()
    item = {
        'name': name,
        'url': response.url
    }
    for row in response.xpath('//th[@scope="row"]'):
        row_name = row.xpath('text()').extract_first('').lower().strip(':')
        row_value = row.xpath('following-sibling::td[1]/text()').extract_first()
        item[row_name] = row_value
    yield item
This returns:
{
    'name': 'Bracht, Nathan',
    'url': 'http://graphics.stltoday.com/apps/payrolls/salaries/teachers/detail/25074/',
    'district': 'Affton 101',
    'school': 'Central Office',
    'position': 'Central Office Admin.',
    'degree earned': 'Doct',
    'salary': '$152,000.00',
    'extended contract pay': None,
    'extra duty pay': None,
    'total pay (all combined)': '$152,000.00',
    'years in district': '5',
    'years in mo schools': '19',
    'multiple position detail': None
}
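Since the feed is exported to CSV, you might also want tidier column names. A small variation of the loop above (my own addition, using Python's re module) that normalizes the row labels into snake_case keys:

import re

def parse_item(self, response):
    name = response.xpath('//p[@class="table__title"]/text()').extract_first()
    item = {'name': name, 'url': response.url}
    for row in response.xpath('//th[@scope="row"]'):
        row_name = row.xpath('text()').extract_first('').lower().strip(':')
        # turn e.g. "total pay (all combined)" into "total_pay_all_combined"
        key = re.sub(r'[^a-z0-9]+', '_', row_name).strip('_')
        row_value = row.xpath('following-sibling::td[1]/text()').extract_first()
        item[key] = row_value
    yield item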
Related
I am trying to scrape a website using Python and Scrapy, but I have issues with saving the result.
The error log I receive:
yield result = {
^
SyntaxError: invalid syntax
When I remove the "result = ", I don't get any error, but the reason I am doing that is to save the result as a variable, which I use at the end of the code in "f.write(result)".
The code is below:
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "ufcspider"

    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield result = {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'link': 'http://quotes.toscrape.com' + quote.css("span a::attr(href)").get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }

        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callable=self.parse)

        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.json'
        with open(filename, 'wb') as f:
            f.write(result)
        self.log(f'Saved file {filename}')
First define result, then yield it:
result = { ... }
yield result
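Applied to the spider in the question, the loop would look something like the sketch below. It only fixes the syntax error, so result will still be undefined by the time f.write(result) runs unless you also collect the results (e.g. into a list). Note too that scrapy.Request expects callback=, not callable=.

def parse(self, response):
    for quote in response.css('div.quote'):
        # define the dict first, then yield it
        result = {
            'text': quote.css('span.text::text').get(),
            'author': quote.css('small.author::text').get(),
            'link': 'http://quotes.toscrape.com' + quote.css("span a::attr(href)").get(),
            'tags': quote.css('div.tags a.tag::text').getall(),
        }
        yield result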
I'm trying to scrape the Jeopardy archive, using scrapy items. Here is my current spider:
import scrapy
from jeopardy.items import JeopardyItem
from scrapy.loader import ItemLoader

class JeopardySpider(scrapy.Spider):
    name = 'clues'
    start_urls = ['http://www.j-archive.com/showgame.php?game_id=1']

    def parse(self, response):
        loader = ItemLoader(item=JeopardyItem(), response=response)
        loader.add_xpath('game_id', '//div[@id="game_title"]//text()')
        loader.add_css('value', 'td.clue_value::text,td.clue_value_daily_double::text')
        loader.add_value('value', 'FJ')
        order_number = response.css('td.clue_order_number').css('a::text').extract()
        loader.add_value('order_number', order_number)
        loader.add_value('order_number', "61")
        loader.add_css('clue_id', 'td.clue_text::attr(id)')
        clue = response.css('td.clue').css('td.clue_text').xpath('string()').extract()
        loader.add_value('clue', clue)
        jep_item = loader.load_item()
        answers_url = response.xpath('//*[@id="final_jeopardy_round"]/h4/a[1]//@href').get()
        yield response.follow(answers_url, self.parse_answers, meta={'jep_item': jep_item})
        # for a in response.xpath('//*[@id="contestants_table"]//a[@href]'):
        #     yield response.follow(a, self.parse)

    def parse_answers(self, response):
        jep_item = response.meta['jep_item']
        loader = ItemLoader(item=jep_item, response=response)
        answers = response.xpath('//em[@class="correct_response"]').xpath('string()').extract()
        loader.add_value('correct_response', answers)
        yield loader.load_item()
Which outputs the following (abridged) JSON:
[
{"game_id": ["Show #4596 - Monday, September 6, 2004"], "value": ["$200", "$200", "$200", ...], "order_number": ["1", "4", "26", ...], "clue_id": ["clue_J_1_1", "clue_J_2_1", "clue_J_3_1"...], "clue": ["Let's all flock to read Psalm 95, in which humans are compared to these animals", "The rap on him is he's sometimes \"Puffy\"", "In the 1980s this city passed Chicago as the USA's second most populous", ...], "correct_response": ["sheep", "Sean Combs", "Los Angeles" ...]}
]
I'm trying to format the data in the following way:
{"game_id": "Show #4596 - Monday, September 6, 2004", "clue_id": "clue_J_1_1", "order_number": "1", "value": "$200", "clue": "Let's all flock to read Psalm 95, in which humans are compared to these animals"},
{"game_id": "Show #4596 - Monday, September 6, 2004", "clue_id": "clue_J_2_1", "order_number": "4", "value": "$200", "clue": "The rap on him is he's sometimes \"Puffy\""}
...
That format was scraped using a different spider, which doesn't use items (it uses regular Python dictionaries) and doesn't output the correct response.
import scrapy

class JeopardySpider(scrapy.Spider):
    name = 'test'
    start_urls = [
        'http://www.j-archive.com/showgame.php?game_id=1'
    ]
    # allowed_domains = ['http://www.j-archive.com']

    def parse(self, response):
        for post in response.css('td.clue'):
            yield {
                'game_id': response.xpath('//div[@id="game_title"]//text()').get(),
                'clue_id': post.css('td.clue_text::attr(id)').get(),
                'order_number': post.css('td.clue_order_number').css('a::text').get(),
                'value': post.css('td.clue_value::text,td.clue_value_daily_double::text').get(),
                'clue': post.css('td.clue_text').xpath('string()').get()
            }
        answers_url = response.xpath('//*[@id="final_jeopardy_round"]/h4/a[1]//@href').get()
        yield response.follow(answers_url, self.parse_answers)
        # for a in response.xpath('//*[@id="contestants_table"]//a[@href]'):
        #     yield response.follow(a, self.parse)

    def parse_answers(self, response):
        yield {
            'correct_response': response.xpath('//em[@class="correct_response"]//text()').get(),
        }
I'm looking to do one of three things:
Format the first spider so it outputs a JSON such that each line is a clue, rather than each line a game
Format the second spider so it adds the correct response
Somehow take the first JSON and make it readable in pandas
Also the game ID in the first JSON might complicate things since that list has one element and not 61 like the others, but I'm willing to comment that out if necessary.
I've been tearing my hair out trying to figure this out, so any help/advice would be greatly appreciated.
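For the third option, here is a minimal pandas sketch (my own illustration, not from the spiders above: it assumes the per-clue lists in the first JSON are all the same length and line up by index, and it broadcasts the single-element game_id onto every row, which sidesteps the length mismatch mentioned above; "clues.json" is a placeholder filename):

import json
import pandas as pd

# "clues.json" stands in for the file produced by the first spider
with open('clues.json') as f:
    games = json.load(f)

game = games[0]
df = pd.DataFrame({
    'clue_id': game['clue_id'],
    'order_number': game['order_number'],
    'value': game['value'],
    'clue': game['clue'],
})
# game_id is a one-element list, so broadcast it to every row
df['game_id'] = game['game_id'][0]
print(df.head())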
How can I skip one iteration of the spider if the webpage contains some data?
Page titles:
We have several page titles across the pages. I've left the other data (dates, likes) out of the examples below.
page 1 title: 'We like cats' # this title is valid
page 2 title: 'This title contains WORD X...' # this title is not valid (skip it)
page 3 title: 'Best ideas' # this title is valid
Code:
from scrapy.spiders import CrawlSpider

class Carflix(CrawlSpider):
    name = 'carflix'
    allowed_domains = ['sitex.com']
    start_urls = ['http://sitex.com/page-1.html',
                  'http://sitex.com/page-2.html',
                  'http://sitex.com/page-3.html']

    def parse(self, response):
        date = response.xpath('//div[@class="date"]/text()').extract_first()
        pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
        if 'WORD X' in pagetitle:
            pass  # what do I need to do here to skip adding data if the page title contains 'WORD X'?
        likes = response.xpath('//div[@class="likes"]/text()').extract_first()
        yield {
            'pagetitle': pagetitle,
            'date': date,
            'likes': likes,
        }
The result should be:
[{
    'pagetitle': 'We like cats',
    'date': '01/01/2019',
    'likes': 200
},
{
    'pagetitle': 'Best ideas',
    'date': '02/01/2019',
    'likes': 100
}]
Just yield your results under your specified condition:
def parse(self, response):
    date = response.xpath('//div[@class="date"]/text()').extract_first()
    pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
    likes = response.xpath('//div[@class="likes"]/text()').extract_first()
    if 'WORD X' not in pagetitle:
        yield {
            'pagetitle': pagetitle,
            'date': date,
            'likes': likes,
        }
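Equivalently (my own variation, not part of the answer above), you can bail out as soon as the title fails the check, before extracting the remaining fields:

def parse(self, response):
    pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
    if 'WORD X' in pagetitle:
        return  # skip this page entirely
    yield {
        'pagetitle': pagetitle,
        'date': response.xpath('//div[@class="date"]/text()').extract_first(),
        'likes': response.xpath('//div[@class="likes"]/text()').extract_first(),
    }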
Here's the part of the code that I want to run 4 times. Without counters it works as intended: the link to the next page is retrieved and scraped for the relevant data:
def parse_commits_page(self, response):
    yield {
        'author': response.xpath('//a[@rel="author"]/text()').extract(),
        'name': response.xpath('//strong/a/text()').extract(),
        'last_commits': response.xpath('//relative-time/text()').extract()
    }
    next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
    yield response.follow(next_page, callback=self.parse_commits_page)
Here are the variants of the cycle I tried:
Adding a simple global counter:
count = 0

def parse_commits_page(self, response):
    global count
    while count < 4:
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
        count = count + 1
        next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
        yield response.follow(next_page, callback=self.parse_commits_page)
Adding a sub-function:
def parse_commits_page(self, response):
    def grabber(response):
        return {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
    yield grabber(response)
    for i in range(3):
        yield response.follow(
            response.xpath('//a[@rel="nofollow"]/@href')[-1].extract(),
            callback=grabber
        )
With the global counter, the response value is updated either once (if count = count + 1 is placed as in the code above) or not at all (if it is placed at the end).
With the sub-function, the response is updated only on the last iteration, resulting in 2 scraped pages instead of 4.
What is the correct way to implement the cycle so that variables are updated as intended?
Here's the complete code, if that helps (I use 4 defs instead of a cycle right now):
# -*- coding: utf-8 -*-
import scrapy
from random import randint
from time import sleep

BASE_URL = 'https://github.com'

class DiscoverSpider(scrapy.Spider):
    name = 'discover_commits_new'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/search?utf8=%E2%9C%93&q=stars%3E100&ref=simplesearch']

    def parse(self, response):
        # Select all the project urls on page
        project = BASE_URL + response.xpath('//h3/a[@class="v-align-middle"]/@href').extract_first()
        yield response.follow(project, self.parse_project)

        # Random wait, so GitHub doesn't ban me right away
        sleep(randint(5, 20))

        # Follow to the next page when every project on this one is scraped
        next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
        if next_page is not None:
            next_page = BASE_URL + next_page
            yield response.follow(next_page, callback=self.parse)

    # Parse the main page of the project
    def parse_project(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'tags': [x.strip() for x in response.css('.topic-tag::text').extract()],
            'lang_name': response.css('.lang::text').extract(),
            'lang_perc': response.css('.percent::text').extract(),
            'stars': response.css('.social-count::text').extract()[1].strip(),
            'forks': response.css('.social-count::text').extract()[2].strip(),
            'commits': response.css('.text-emphasized::text').extract()[0].strip(),
            'contributors': response.css('.text-emphasized::text').extract()[3].strip()
        }
        commits_page = BASE_URL + response.xpath('//*[@class="commits"]//@href').extract_first()
        yield response.follow(commits_page, self.parse_commits_page)

    # Get last commits
    def parse_commits_page(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
        next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
        yield response.follow(next_page, callback=self.parse_commits_page1)

    def parse_commits_page1(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
        next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
        yield response.follow(next_page, callback=self.parse_commits_page2)

    def parse_commits_page2(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
        next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
        yield response.follow(next_page, callback=self.parse_commits_page3)

    def parse_commits_page3(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
So as you are about to see, I am just starting with Python/Scrapy/programming in general. I am trying to figure out how to do multiple form requests in the same spider. I am trying to scrape data from a clerk and recorder’s webpage, but for two (or more) different names. Here is what gets me the first pages of desired results (for the name “Cruz”):
import scrapy

class LoginSpider(scrapy.Spider):
    name = "CRSpider5"
    login_url = 'http://recordingsearch.car.elpasoco.com/rsui/opr/search.aspx'
    start_urls = [login_url]

    def parse(self, response):
        validation = response.css('input[name="__EVENTVALIDATION"]::attr(value)').extract_first()
        state = response.css('input[name="__VIEWSTATE"]::attr(value)').extract_first()
        generator = response.css('input[name="__VIEWSTATEGENERATOR"]::attr(value)').extract_first()
        data = {
            '__EVENTVALIDATION': validation,
            '__VIEWSTATE': state,
            '__VIEWSTATEGENERATOR': generator,
            '__LASTFOCUS': '',
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            'ctl00$ContentPlaceHolder1$btnSubmit': 'Submit+Search',
            'ctl00$ContentPlaceHolder1$lbxDocumentTypes': 'TRANS',
            'ctl00$ContentPlaceHolder1$txtGrantorGranteeName': 'cruz',
        }
        yield scrapy.FormRequest(url=self.login_url, formdata=data, callback=self.parse_quotes)

    def parse_quotes(self, response):
        for test in response.css('table#ctl00_ContentPlaceHolder1_gvSearchResults tr')[1:-2]:
            yield {
                'Debtor': test.css("span::text").extract_first(),
                'Creditor': test.css("span::text")[1].extract(),
                'Date Recorded': test.css('font::text')[3].extract(),
                'Instrument Number': test.css('font::text').extract_first(),
                'County': 'El Paso'
            }
I would like to do the same thing above but with multiple names (changing the 'ctl00$ContentPlaceHolder1$txtGrantorGranteeName' field to a different name like “smith” or “Jones”). How would I do this in the same spider? Thanks!
If you want to use a random name to start a FormRequest, you can:
import scrapy
import random

class LoginSpider(scrapy.Spider):
    name = "CRSpider5"
    login_url = 'http://recordingsearch.car.elpasoco.com/rsui/opr/search.aspx'
    start_urls = [login_url]
    search_names = ['smith', 'Jones']  # changed: list of names to search, kept separate from the spider's name attribute

    def parse(self, response):
        validation = response.css('input[name="__EVENTVALIDATION"]::attr(value)').extract_first()
        state = response.css('input[name="__VIEWSTATE"]::attr(value)').extract_first()
        generator = response.css('input[name="__VIEWSTATEGENERATOR"]::attr(value)').extract_first()
        data = {
            '__EVENTVALIDATION': validation,
            '__VIEWSTATE': state,
            '__VIEWSTATEGENERATOR': generator,
            '__LASTFOCUS': '',
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            'ctl00$ContentPlaceHolder1$btnSubmit': 'Submit+Search',
            'ctl00$ContentPlaceHolder1$lbxDocumentTypes': 'TRANS',
            'ctl00$ContentPlaceHolder1$txtGrantorGranteeName': random.choice(self.search_names),  # changed: pick a name at random
        }
        yield scrapy.FormRequest(url=self.login_url, formdata=data, callback=self.parse_quotes)
If you want to use different names to start many requests, you can loop over the list of names and yield one request per name.
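A minimal sketch of that looping approach (my own illustration, reusing the form fields from the answer above and the hypothetical search_names list):

def parse(self, response):
    validation = response.css('input[name="__EVENTVALIDATION"]::attr(value)').extract_first()
    state = response.css('input[name="__VIEWSTATE"]::attr(value)').extract_first()
    generator = response.css('input[name="__VIEWSTATEGENERATOR"]::attr(value)').extract_first()
    for search_name in self.search_names:
        data = {
            '__EVENTVALIDATION': validation,
            '__VIEWSTATE': state,
            '__VIEWSTATEGENERATOR': generator,
            '__LASTFOCUS': '',
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            'ctl00$ContentPlaceHolder1$btnSubmit': 'Submit+Search',
            'ctl00$ContentPlaceHolder1$lbxDocumentTypes': 'TRANS',
            'ctl00$ContentPlaceHolder1$txtGrantorGranteeName': search_name,
        }
        # one FormRequest per name, all handled by the same callback
        yield scrapy.FormRequest(url=self.login_url, formdata=data, callback=self.parse_quotes)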