My output is as follows:
0  winner   loser
1  winner1
2           loser1
3  winner2
4           loser2
5  winner3
6           loser3
How do I remove the empty cells so that the winner and loser values end up on the same row? I've tried to find a "new line" parameter for pipelines but had no luck. Is there any way to override the pipeline so it only writes a row when the item actually has a value, so that the output lands on the same row?
spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import json
from scrapy.http import Request, FormRequest


class MySpider(scrapy.Spider):
    name = "jsscraper"
    start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url,
                                callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 3.5})

    def parse(self, response):
        for row in response.css('.event-team'):
            il = SofascoreItemLoader(selector=row)
            il.add_css('winner', '.event-team:nth-child(2)::text')
            il.add_css('loser', '.event-team:nth-child(1)::text')
            yield il.load_item()
pipeline.py
from scrapy.exporters import CsvItemExporter


class ScrapejsPipeline(object):
    def process_item(self, item, spider):
        return item


class CsvPipeline(object):
    def __init__(self):
        self.file = open("quotedata2.csv", 'w+b')
        self.exporter = CsvItemExporter(self.file, str)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from operator import methodcaller
from scrapy import Spider, Request, Selector


class SofascoreItem(scrapy.Item):
    loser = scrapy.Field()
    winner = scrapy.Field()
    # date = scrapy.Field()


class SofascoreItemLoader(ItemLoader):
    default_item_class = SofascoreItem
    default_input_processor = MapCompose(methodcaller('strip'))
    default_output_processor = TakeFirst()
Check this answer, which shows where the problem is located: https://stackoverflow.com/a/48859488/9270398
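Note that each yielded item becomes one CSV row, so the cleanest fix is to yield items with both fields already filled. If you additionally want the pipeline itself to refuse to write incomplete items rather than producing empty cells, a minimal sketch using Scrapy's DropItem (an illustration only, based on the SofascoreItem fields above, not the linked fix) could look like this:

from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter


class CsvPipeline(object):
    def __init__(self):
        self.file = open("quotedata2.csv", 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # Only write rows that have both fields populated;
        # anything else is dropped and never reaches the exporter.
        if not item.get('winner') or not item.get('loser'):
            raise DropItem('Incomplete row: %r' % item)
        self.exporter.export_item(item)
        return item

This only filters incomplete items; it cannot merge two half-empty items into a single row, so the selectors still need to produce complete items in the first place.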
Related
I believe my XPaths are written incorrectly, as I only get a single result for each URL, whereas there are 25 job posts in total for each URL (not including those on the next page). How can I correct my XPaths to get all the results?
Here's my scraper:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict


class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())


class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
There was a slight mistake in the requests, which I have updated for those of you who checked during the first 15 minutes after I posted it.
The problem was in the container's XPath. You were selecting only the container, without the items inside it, so you looped once over the container itself rather than over the actual items you want to scrape.
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict


class CvItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())


class CvSpider(scrapy.Spider):
    name = 'cv'
    start_urls = {'Accountancy_finance': ['https://www.cv-library.co.uk/Degree-Finance-jobs?us=1',
                                          'https://www.cv-library.co.uk/Degree-Accounting-jobs?us=1'],
                  'Aeronautical_Engineering': ['https://www.cv-library.co.uk/Degree-Aeronautical-Engineering-jobs?us=1'],
                  'Manufacturing_Engineering': ['https://www.cv-library.co.uk/Degree-Manufacturing-Engineering-jobs?us=1'],
                  'Agriculture_and_Forestry': ['https://www.cv-library.co.uk/Degree-Forestry-jobs?us=1']}

    def start_requests(self):
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    cb_kwargs={
                        'items': items
                    }
                )

    def parse(self, response, items):
        container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            loader.add_xpath('title', '//article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//article[@id]/div//div/p/a//text()')
            yield loader.load_item()
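One detail worth double-checking in both versions: inside the loop, XPaths that begin with // still search the whole page rather than just the current li, so with TakeFirst each item can end up with the first match on the page. Scrapy's usual pattern is to make the inner paths relative with a leading dot. A sketch of the parse method above rewritten that way (same selectors, untested against the live site):

    def parse(self, response, items):
        for lists in response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]'):
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_value('category', items)
            # The leading ".//" keeps each query scoped to the current <li>.
            loader.add_xpath('title', './/article[@id]//a[@title]/@title')
            loader.add_xpath('salary', './/article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', './/article[@id]/div//div/p/a//text()')
            yield loader.load_item()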
My Scrapy crawler collects data from the PTT website and writes it into a Google spreadsheet using gspread. My ptt spider parses the latest 40 posts on the site every day, and now I would like to drop duplicate data from those 40 posts: for example, if the post_title or post_link is the same as yesterday's, the post should not be written to the Google spreadsheet.
I know I should use DropItem in Scrapy, but I honestly don't know how to fix my code (I am a very new beginner in Python) and would like to ask for help with this. Thanks.
This is my ptt spider code:
# -*- coding: utf-8 -*-
import scrapy
# from scrapy.exceptions import CloseSpider
from myFirstScrapyProject.items import MyfirstscrapyprojectItem


class PttSpider(scrapy.Spider):
    count_page = 1
    name = 'ptt'
    allowed_domains = ['www.ptt.cc/']
    start_urls = ['https://www.ptt.cc/bbs/e-shopping/search?q=%E8%9D%A6%E7%9A%AE'] + ['https://www.ptt.cc/bbs/e-seller/search?q=%E8%9D%A6%E7%9A%AE']
    # start_urls = ['https://www.ptt.cc/bbs/e-shopping/index.html']

    def parse(self, response):
        items = MyfirstscrapyprojectItem()
        for q in response.css('div.r-ent'):
            items['push'] = q.css('div.nrec > span.h1::text').extract_first()
            items['title'] = q.css('div.title > a::text').extract_first()
            items['href'] = q.css('div.title > a::attr(href)').extract_first()
            items['date'] = q.css('div.meta > div.date ::text').extract_first()
            items['author'] = q.css('div.meta > div.author ::text').extract_first()
            yield items
And this is my pipeline:
from myFirstScrapyProject.exporters import GoogleSheetItemExporter
from scrapy.exceptions import DropItem


class MyfirstscrapyprojectPipeline(object):
    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Thanks to Sharmiko I rewrote it, but it doesn't seem to work. What should I fix?
from myFirstScrapyProject.exporters import GoogleSheetItemExporter
from scrapy.exceptions import DropItem


class MyfirstscrapyprojectPipeline(object):
    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()

    # def process_item(self, item, spider):
    #     self.exporter.export_item(item)
    #     return item

    # class DuplicatesTitlePipeline(object):
    def __init__(self):
        self.article = set()

    def process_item(self, item, spider):
        href = item['href']
        if href in self.article:
            raise DropItem('duplicates href found %s', item)
        self.exporter.export_item(item)
        return item
This is the code that exports to the Google sheet:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from scrapy.exporters import BaseItemExporter


class GoogleSheetItemExporter(BaseItemExporter):
    def __init__(self):
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        credentials = ServiceAccountCredentials.from_json_keyfile_name('pythonupload.json', scope)
        gc = gspread.authorize(credentials)
        self.spreadsheet = gc.open('Community')
        self.worksheet = self.spreadsheet.get_worksheet(1)

    def export_item(self, item):
        self.worksheet.append_row([item['push'], item['title'],
                                   item['href'], item['date'], item['author']])
You should modify your process_item function to check for duplicate elements, and if one is found, drop it.
from scrapy.exceptions import DropItem
...

def process_item(self, item, spider):
    if [your duplicate check logic goes here]:
        raise DropItem('Duplicate element found')
    else:
        self.exporter.export_item(item)
        return item
Dropped items are no longer passed to other pipeline components. You can read more about item pipelines in the Scrapy documentation.
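Applying that to the pipeline above: in the rewrite, the href is checked against self.article but never added to it, so nothing is ever detected as a duplicate. A minimal sketch of the duplicate check (assuming the GoogleSheetItemExporter and the href field from the question) might look like:

from myFirstScrapyProject.exporters import GoogleSheetItemExporter
from scrapy.exceptions import DropItem


class MyfirstscrapyprojectPipeline(object):
    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()
        self.seen_hrefs = set()  # hrefs already exported during this run

    def close_spider(self, spider):
        self.exporter.finish_exporting()

    def process_item(self, item, spider):
        href = item['href']
        if href in self.seen_hrefs:
            # Duplicate within this run: drop it so it never reaches the exporter.
            raise DropItem('Duplicate href found: %s' % href)
        self.seen_hrefs.add(href)
        self.exporter.export_item(item)
        return item

This only deduplicates within a single run. To also skip posts that are already in the spreadsheet from a previous day, you would need to pre-load the existing hrefs from the worksheet into seen_hrefs in open_spider, which is not shown here.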
I created a Scrapy project to scrape a few pieces of information off this classifieds website, but the data I was getting needed to be formatted. After doing some research I figured out how to implement an ItemLoader, but now it does not write any scraped data to the CSV file.
Here's my spider.py:
import scrapy
from ..items import TestItem
from scrapy.loader import ItemLoader


class TestSpiderSpider(scrapy.Spider):
    name = 'test'
    page_number = 2
    start_urls = ['https://jamaicaclassifiedonline.com/auto/cars/']

    def parse(self, response):
        for car in response.css('.col.l3.s12.m6'):
            items = TestItem()
            product_title = car.css('.jco-card-title::text').extract()
            product_imagelink = car.css('.card-image img::attr(data-src)').getall()
            urls = car.css('.card-image a::attr(href)').getall()
            for url in urls:
                url = response.urljoin(url)
                yield scrapy.Request(url=url, callback=self.parse_details)
            if product_title and product_imagelink:
                items['urls'] = urls

    def parse_details(self, response):
        l = ItemLoader(item=TestItem(), selector=response)
        l.add_css('product_title', '#title::text')
        yield l.load_item()
Here's my items.py
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags


class TestItem(scrapy.Item):
    product_title = scrapy.Field(input_processors=MapCompose(remove_tags), output_processor=TakeFirst())
Here's my settings.py:
BOT_NAME = 'test'

SPIDER_MODULES = ['test.spiders']
NEWSPIDER_MODULE = 'test.spiders'

ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_user_agents.middlewares.RandomUserAgentMiddleware': 400,
}
Here's my pipeline.py:
class TestPipeline:
    def process_item(self, item, spider):
        return item
You don't need a pipeline enabled to use an ItemLoader; try it without one.
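If the goal is just to get the items into a CSV file, Scrapy's built-in feed exports can do it without any custom pipeline code. A minimal sketch, assuming a Scrapy version recent enough to support the FEEDS setting (2.1+):

# settings.py: write every scraped item to output.csv with the built-in CSV exporter
FEEDS = {
    'output.csv': {'format': 'csv'},
}

On older versions, or as a one-off, running the spider with scrapy crawl test -o output.csv does the same thing from the command line.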
I want to remove the [ ] brackets Scrapy adds to all its output. To do this you simply add [0] at the end of an XPath statement, as follows:
'a[@class="question-hyperlink"]/text()').extract()[0]
This solves the [ ] problem in some cases, but in other cases Scrapy returns every second row of output as blank, so the moment it gets to such a row when using [0] I'm given the error:
IndexError: list index out of range
How can I prevent Scrapy from creating blank rows? This seems to be a common problem, but everyone else hits it when exporting to CSV, whereas for me it happens in the Scrapy response, before exporting to CSV.
Items.py:
import scrapy
from scrapy.item import Item, Field


class QuestionItem(Item):
    title = Field()
    url = Field()


class PopularityItem(Item):
    votes = Field()
    answers = Field()
    views = Field()


class ModifiedItem(Item):
    lastModified = Field()
    modName = Field()
The spider that doesn't output every second row as blank and thus works with [0]:
from scrapy import Spider
from scrapy.selector import Selector

from stack.items import QuestionItem


class QuestionSpider(Spider):
    name = "questions"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?pagesize=50&sort=newest",
    ]

    def parse(self, response):
        questions = Selector(response).xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = QuestionItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
The spider that gives every second row of output as blank:
from scrapy import Spider
from scrapy.selector import Selector

from stack.items import PopularityItem


class PopularitySpider(Spider):
    name = "popularity"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "https://stackoverflow.com/",
    ]

    def parse(self, response):
        popularity = response.xpath('//div[contains(@class, "question-summary narrow")]/div')
        for poppart in popularity:
            item = PopularityItem()
            item['votes'] = poppart.xpath(
                'div[contains(@class, "votes")]//span/text()').extract()  # [0]
            item['answers'] = poppart.xpath(
                'div[contains(@class, "answered")]//span/text()').extract()  # [0]
            item['views'] = poppart.xpath(
                'div[contains(@class, "views")]//span/text()').extract()  # [0]
            yield item
Pipelines.py
import pymongo
import logging

from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log


class StackPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        self.db = connection[settings['MONGODB_DB']]

    def process_item(self, item, spider):
        collection = self.db[type(item).__name__.lower()]
        logging.info(collection.insert(dict(item)))
        return item
The easiest way to handle an error like this is to catch it and deal with it on the spot (in this case, by simply skipping past the blank rows):
class PopularitySpider(Spider):
    name = "popularity"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ["https://stackoverflow.com/"]

    def parse(self, response):
        popularity = response.xpath('//div[contains(@class, "question-summary narrow")]/div')
        for poppart in popularity:
            try:
                item = PopularityItem()
                item['votes'] = poppart.xpath('div[contains(@class, "votes")]//span/text()').extract()[0]
                item['answers'] = poppart.xpath('div[contains(@class, "answered")]//span/text()').extract()[0]
                item['views'] = poppart.xpath('div[contains(@class, "views")]//span/text()').extract()[0]
            except IndexError:
                continue
            yield item
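An alternative, if you would rather avoid the exception altogether, is extract_first() (or .get() in newer Scrapy versions), which returns None instead of raising IndexError when nothing matches. A sketch of the same parse method under that approach:

    def parse(self, response):
        popularity = response.xpath('//div[contains(@class, "question-summary narrow")]/div')
        for poppart in popularity:
            votes = poppart.xpath('div[contains(@class, "votes")]//span/text()').extract_first()
            # extract_first() returns None when the selector matches nothing,
            # so blank rows can simply be skipped instead of raising IndexError.
            if votes is None:
                continue
            item = PopularityItem()
            item['votes'] = votes
            item['answers'] = poppart.xpath('div[contains(@class, "answered")]//span/text()').extract_first()
            item['views'] = poppart.xpath('div[contains(@class, "views")]//span/text()').extract_first()
            yield item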
I'd like to collect an item's data from two different websites.
It should work as follows:
1. parse_website_1 fetches a person's name from website_1 and populates the item
2. parse_website_1 yields a request for parse_website_2
3. parse_website_2 parses website_2, collects the person's hair color based on the name scraped from website_1, and populates the item
4. parse_website_2 loads the item
Would this be a step in the right direction, given that the item is defined in items.py?
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem


class MySpider(scrapy.Spider):
    name = "myspider"

    def __init__(self):
        self.item = ItemLoader(item=MyItem(), response=response)

    def start_requests(self):
        scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        self.item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2)

    def parse_website_2(self, response):
        self.item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield self.item.load_item()
The idea is right, but the implementation is not correct, in that you are trying to pass data between consecutive requests using an instance attribute (self.item). Scrapy requests are asynchronous, so that would not work as expected.
The correct way to do it is outlined in the Scrapy FAQ: pass the partial item data to the consecutive request using the Request's meta attribute, retrieve it via the Response's meta attribute, add some more data, and finally yield the item. Here's the adapted code:
import scrapy
from scrapy.loader import ItemLoader
from myproject.items import MyItem


class MySpider(scrapy.Spider):
    name = "myspider"

    def start_requests(self):
        yield scrapy.Request(url="http://website_1.com", callback=self.parse_website_1)

    def parse_website_1(self, response):
        item = ItemLoader(item=MyItem(), response=response)
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        item.add_value("name", name)
        website_2_path = "http://website_2.com/" + name
        yield scrapy.Request(url=website_2_path, callback=self.parse_website_2, meta={'item': item})

    def parse_website_2(self, response):
        item = response.meta['item']
        item.add_xpath("hair_color", '//div[@class="hair_color"]')
        yield item.load_item()
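As a side note, Scrapy 1.7+ also supports cb_kwargs for passing data between callbacks (the same mechanism the cv spider further up this page uses), which is now generally preferred over meta for user data. A sketch of the two callbacks rewritten that way, extracting the hair color from the second response explicitly before adding it to the loader:

    def parse_website_1(self, response):
        item = ItemLoader(item=MyItem(), response=response)
        name = response.xpath('//div[@class="name"]/text()').extract_first()
        item.add_value("name", name)
        # Hand the partially filled loader to the next callback as a keyword argument.
        yield scrapy.Request(url="http://website_2.com/" + name,
                             callback=self.parse_website_2,
                             cb_kwargs={'item': item})

    def parse_website_2(self, response, item):
        # Extract from the second response explicitly; the loader's own selector
        # still points at the first page, so add_xpath here would query website_1.
        hair_color = response.xpath('//div[@class="hair_color"]/text()').extract_first()
        item.add_value("hair_color", hair_color)
        yield item.load_item()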