I believe I have written my XPaths incorrectly, because I only get a single result for each URL, whereas there are 25 job posts in total on each URL (not including those on the next page). How can I correct my XPaths to get all the results?
Here's my scraper:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict
class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
There was a slight mistake in the requests, which I have since fixed, for those of you who looked at this within the first 15 minutes of my posting it.
The problem was in the container's XPath. It selected only the container, not the items inside it, so you looped just once over the container itself instead of over the actual items you want to scrape.
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict
class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
I created a Scrapy project to scrape some information off this classifieds website, but the data I was getting needed to be formatted. After doing some research I figured out how to implement an ItemLoader, but now it does not write any scraped data to the CSV file.
Here's my spider.py:
import scrapy
from ..items import TestItem
from scrapy.loader import ItemLoader

class TestSpiderSpider(scrapy.Spider):
    name = 'test'
    page_number = 2
    start_urls = ['https://jamaicaclassifiedonline.com/auto/cars/']

    def parse(self, response):
        for car in response.css('.col.l3.s12.m6'):
            items = TestItem()
            product_title = car.css('.jco-card-title::text').extract()
            product_imagelink = car.css('.card-image img::attr(data-src)').getall()
            urls = car.css('.card-image a::attr(href)').getall()
            for url in urls:
                url = response.urljoin(url)
                yield scrapy.Request(url=url, callback=self.parse_details)
            if product_title and product_imagelink:
                items['urls'] = urls

    def parse_details(self, response):
        l = ItemLoader(item=TestItem(), selector=response)
        l.add_css('product_title', '#title::text')
        yield l.load_item()
Here's my items.py:
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags
class TestItem(scrapy.Item):
    product_title = scrapy.Field(input_processors=MapCompose(remove_tags), output_processor=TakeFirst())
Here's my settings.py:
BOT_NAME = 'test'
SPIDER_MODULES = ['test.spiders']
NEWSPIDER_MODULE = 'test.spiders'
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}
Here's my pipelines.py:
class TestPipeline:
    def process_item(self, item, spider):
        return item
You don't need a pipeline enabled to use an ItemLoader; try it without one.
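For reference, a minimal sketch of writing the loaded items to a CSV without any pipeline, using Scrapy's built-in feed exports (the output.csv name is just an example, and the FEEDS setting assumes Scrapy 2.1+; older versions use the FEED_FORMAT/FEED_URI pair that appears later in this thread):

# settings.py - the feed exporter writes whatever the spider yields; no ITEM_PIPELINES needed
FEEDS = {
    'output.csv': {'format': 'csv'},
}

Equivalently, leave the settings alone and run the spider with scrapy crawl test -o output.csv.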
I can't seem to get the ItemLoader to work. I don't get any errors in the scrapy log, just nothing gets extracted. Any ideas would be helpful!
import scrapy
from medium.items import MediumItem
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider
import logging
from scrapy.utils.log import configure_logging
class DataSpider(CrawlSpider):
    custom_settings = {
        'LOG_FILE': 'my_log.log',
        'LOG_LEVEL': 'ERROR'}
    logging.getLogger().addHandler(logging.StreamHandler())
    name = 'data'
    allowed_domains = ['medium.com', 'towardsdatascience.com']
    start_urls = ['https://medium.com/tag/python/archive/02/01']
    handle_httpstatus_list = [302]

    def parse(self, response):
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:
            if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                l = ItemLoader(item=MediumItem(), selector=article)
                l.default_output_processor = scrapy.loader.processors.TakeFirst()
                l.add_css('Title', 'div > h3::text')
                l.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
                l.add_css('Read', 'span::attr(title)')
                l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"]/text()')
                l.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
                l.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
                l.add_value('Page', response.url)
                yield l.load_item()
The Items file is
import scrapy
from scrapy.item import Item, Field
class MediumItem(Item):
    Title = scrapy.Field()
    Name = scrapy.Field()
    Date = scrapy.Field()
    Read = scrapy.Field()
    Publication = scrapy.Field()
    Claps = scrapy.Field()
    Responses = scrapy.Field()
At the start I see two problems:
MediumItem needs an extra field:
Page = scrapy.Field()
The page https://medium.com/tag/python/archive/02/01 is redirected to https://medium.com/tag/python/archive, but that redirect is blocked by
handle_httpstatus_list = [302]
After removing handle_httpstatus_list, I get data from the first page.
Result (csv)
Claps,Date,Name,Page,Publication,Read,Responses,Title
81K,,Daniel van Flymen,https://medium.com/tag/python/archive,,9 min read,383 responses,Learn Blockchains by Building One
25K,,Jonny Fox,https://medium.com/tag/python/archive,,6 min read,63 responses,Regex tutorial — A quick cheatsheet by examples
9.6K,,Susan Li,https://medium.com/tag/python/archive,,9 min read,112 responses,"Building A Logistic Regression in Python, Step by Step"
5.8K,,Adi Bronshtein,https://medium.com/tag/python/archive,,9 min read,46 responses,Train/Test Split and Cross Validation in Python
7.8K,,Will Koehrsen,https://medium.com/tag/python/archive,,21 min read,42 responses,Random Forest in Python
7.2K,,Ted Petrou,https://medium.com/tag/python/archive,,24 min read,34 responses,Selecting Subsets of Data in Pandas: Part 1
11.1K,,Milo Spencer-Harper,https://medium.com/tag/python/archive,,6 min read,86 responses,How to build a simple neural network in 9 lines of Python code
5.2K,,Michael Galarnyk,https://medium.com/tag/python/archive,,8 min read,27 responses,PCA using Python (scikit-learn)
64K,,TK,https://medium.com/tag/python/archive,,11 min read,148 responses,Learning Python: From Zero to Hero
6.9K,,Susan Li,https://medium.com/tag/python/archive,,9 min read,75 responses,An End-to-End Project on Time Series Analysis and Forecasting with Python
The code which I used - all in one file, without creating a project:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider
import logging
from scrapy.utils.log import configure_logging
class MediumItem(scrapy.Item):
    Title = scrapy.Field()
    Name = scrapy.Field()
    Date = scrapy.Field()
    Read = scrapy.Field()
    Publication = scrapy.Field()
    Claps = scrapy.Field()
    Responses = scrapy.Field()
    Page = scrapy.Field()

class DataSpider(CrawlSpider):
    custom_settings = {
        'LOG_FILE': 'my_log.log',
        'LOG_LEVEL': 'ERROR'}
    logging.getLogger().addHandler(logging.StreamHandler())
    name = 'data'
    allowed_domains = ['medium.com', 'towardsdatascience.com']
    start_urls = ['https://medium.com/tag/python/archive/02/01']
    #handle_httpstatus_list = [302]

    def parse(self, response):
        print('url:', response.url)
        articles = response.xpath('//div[@class="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls"]')
        for article in articles:
            if article.xpath('.//a[@class="button button--smaller button--chromeless u-baseColor--buttonNormal"]/@href').extract_first():
                l = ItemLoader(item=MediumItem(), selector=article)
                l.default_output_processor = scrapy.loader.processors.TakeFirst()
                l.add_css('Title', 'div > h3::text')
                l.add_xpath('Name', './/a[@class="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"]/text()')
                l.add_css('Read', 'span::attr(title)')
                l.add_xpath('Publication', './/a[@class="ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"]/text()')
                l.add_xpath('Claps', './/button[@class="button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"]/text()')
                l.add_xpath('Responses', './/a[@class="button button--chromeless u-baseColor--buttonNormal"]/text()')
                l.add_value('Page', response.url)
                yield l.load_item()

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(DataSpider)
c.start()
I was going to scrape the commentary from the espncricinfo website using Scrapy, and I got a blank output (items.csv). These are my files.
cricinfo.py (Spider File)
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from crictest.items import CrictestItem
class MySpider(BaseSpider):
    name = "cricinfo"
    allowed_domains = ["espncricinfo.com/"]
    start_urls = ["http://www.espncricinfo.com/champions-league-twenty20-2014/engine/match/763595.html?innings=1;view=commentary/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//td[@class="battingComms" and b]')
        for row in rows:
            item = CrictestItem()
            item['overnum'] = row.select('b/text()').extract()[0]
            item['overnumtext'] = row.select('b/following-sibling::text()').extract()[0]
            yield item
items.py
import scrapy
class CrictestItem(scrapy.Item):
    overnum = scrapy.Field()
    overnumtext = scrapy.Field()
The problem is your XPath. You can test candidate expressions in Chrome's console, for example:
$x('//*[@id="commInnings"]/div[2]/div/div')
With the expression currently in your code,
rows = hxs.select('//td[@class="battingComms" and b]')
I can't get any output in the console, so it matches nothing on the page.
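As a rough sketch of how parse() could use that container instead, keeping the same old-style selector API as your spider - the inner b/text() paths are an assumption carried over from your original code, so adjust them to whatever markup actually sits inside those divs:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # use the container that the Chrome test above actually matches
    rows = hxs.select('//*[@id="commInnings"]/div[2]/div/div')
    for row in rows:
        # assumed structure: a bold over number followed by the commentary text
        overnum = row.select('.//b/text()').extract()
        overnumtext = row.select('.//b/following-sibling::text()').extract()
        if overnum and overnumtext:
            item = CrictestItem()
            item['overnum'] = overnum[0]
            item['overnumtext'] = overnumtext[0]
            yield item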
My goal is to extract all 25 rows (6 items per row) per page, then iterate over each of the 40 pages.
Currently, my spider extracts the first row from pages 1-3 (see CSV output image).
I assumed the list_iterator() function would iterate over each row; however, there appears to be an error in either my rules or my list_iterator() function that is preventing all rows per page from being scraped.
Any assistance or advice is greatly appreciated!
propub_spider.py:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from propub.items import PropubItem
from scrapy.http import Request
class propubSpider(CrawlSpider):
    name = 'prop$'
    allowed_domains = ['https://projects.propublica.org']
    max_pages = 40
    start_urls = [
        'https://projects.propublica.org/docdollars/search?state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=2&state%5Bid%5D=33',
        'https://projects.propublica.org/docdollars/search?page=3&state%5Bid%5D=33']

    rules = (Rule(SgmlLinkExtractor(allow=('\\search?page=\\d')), 'parse_start_url', follow=True),)

    def list_iterator(self):
        for i in range(self.max_pages):
            yield Request('https://projects.propublica.org/docdollars/search?page=d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]/tbody'):
            item = PropubItem()
            item['payee'] = sel.xpath('tr[1]/td[1]/a[2]/text()').extract()
            item['link'] = sel.xpath('tr[1]/td[1]/a[1]/@href').extract()
            item['city'] = sel.xpath('tr[1]/td[2]/text()').extract()
            item['state'] = sel.xpath('tr[1]/td[3]/text()').extract()
            item['company'] = sel.xpath('tr[1]/td[4]').extract()
            item['amount'] = sel.xpath('tr[1]/td[7]/span/text()').extract()
            yield item
pipelines.py:
import csv
class PropubPipeline(object):
    def __init__(self):
        self.myCSV = csv.writer(open('C:\Users\Desktop\propub.csv', 'wb'))
        self.myCSV.writerow(['payee', 'link', 'city', 'state', 'company', 'amount'])

    def process_item(self, item, spider):
        self.myCSV.writerow([item['payee'][0].encode('utf-8'),
                             item['link'][0].encode('utf-8'),
                             item['city'][0].encode('utf-8'),
                             item['state'][0].encode('utf-8'),
                             item['company'][0].encode('utf-8'),
                             item['amount'][0].encode('utf-8')])
        return item
items.py:
import scrapy
from scrapy.item import Item, Field
class PropubItem(scrapy.Item):
    payee = scrapy.Field()
    link = scrapy.Field()
    city = scrapy.Field()
    state = scrapy.Field()
    company = scrapy.Field()
    amount = scrapy.Field()
CSV output:
Multiple things need to be fixed:
use the start_requests() method instead of list_iterator()
there is a missing % here:
yield Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)
                                                                # HERE^
you don't need CrawlSpider since you are providing the pagination links via start_requests() - use a regular scrapy.Spider
it would be more reliable if the XPath expressions matched the cells by class attributes
Fixed version:
import scrapy
from propub.items import PropubItem
class propubSpider(scrapy.Spider):
    name = 'prop$'
    allowed_domains = ['projects.propublica.org']
    max_pages = 40

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('https://projects.propublica.org/docdollars/search?page=%d' % i, callback=self.parse)

    def parse(self, response):
        for sel in response.xpath('//*[@id="payments_list"]//tr[@data-payment-id]'):
            item = PropubItem()
            item['payee'] = sel.xpath('td[@class="name_and_payee"]/a[last()]/text()').extract()
            item['link'] = sel.xpath('td[@class="name_and_payee"]/a[1]/@href').extract()
            item['city'] = sel.xpath('td[@class="city"]/text()').extract()
            item['state'] = sel.xpath('td[@class="state"]/text()').extract()
            item['company'] = sel.xpath('td[@class="company"]/text()').extract()
            item['amount'] = sel.xpath('td[@class="amount"]/text()').extract()
            yield item