How to export scraped data using FEEDS/FEED EXPORTS in Scrapy - Python

I'm new to web scraping/Scrapy and Python.
Scrapy version: Scrapy 2.5.1
OS: Windows
IDE: PyCharm
I am trying to use the FEEDS option in Scrapy to automatically export the scraped data from a website to a CSV file I can open in Excel.
I tried the following Stack Overflow solution but it didn't work. I'm not sure what I'm doing wrong here; am I missing something?
I also tried adding the same settings to my settings.py file, after commenting out custom_settings in my spider class, as per the example provided in the documentation: https://docs.scrapy.org/en/latest/topics/feed-exports.html?highlight=feed#feeds
For now I achieved my requirement by using the spider_closed signal to write the data to CSV, storing all the scraped items in a list called result.
class SpiderFC(scrapy.Spider):
    name = "FC"
    start_urls = [
        url,
    ]
    custom_setting = {"FEEDS": {r"C:\Users\rreddy\PycharmProjects\fcdc\webscrp\outputfinal.csv": {"format": "csv", "overwrite": True}}}

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(SpiderFC, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def __init__(self, name=None):
        super().__init__(name)
        self.count = None

    def parse(self, response, **kwargs):
        # each item scraped from the parent page has a link where the actual data
        # needs to be scraped, so I follow each link and scrape the data there
        yield response.follow(notice_href_follow, callback=self.parse_item,
                              meta={'item': item, 'index': index, 'next_page': next_page})

    def parse_item(self, response):
        # logic for the items to scrape goes here;
        # items are saved to a temp list, appended to the result list, and the temp list is cleared
        result.append(it)  # result is used at the end to write the CSV
        item.clear()
        if next_page:
            yield next(self.follow_next(response, next_page))

    def follow_next(self, response, next_page):
        next_page_url = urljoin(url, next_page[0])
        yield response.follow(next_page_url, callback=self.parse)
The spider_closed signal handler:
    def spider_closed(self, spider):
        with open(output_path, mode="a", newline='') as f:
            writer = csv.writer(f)
            for v in result:
                writer.writerow([v["city"]])
When all the data is scraped and all requests are complete, the spider_closed signal writes the data to a CSV. But I'm trying to avoid this extra logic and use Scrapy's built-in exporter instead, and I'm having trouble getting it to export the data.

Check your path. If you are on Windows, provide the full path in custom_settings, e.g. as below:
custom_settings = {
    "FEEDS": {r"C:\Users\Name\Path\To\outputfinal.csv": {"format": "csv", "overwrite": True}}
}
If you are on Linux or macOS, provide the path as below:
custom_settings = {
    "FEEDS": {r"/Path/to/folder/fcdc/webscrp/outputfinal.csv": {"format": "csv", "overwrite": True}}
}
Alternatively, provide a relative path as below, which will create the folder structure fcdc/webscrp/outputfinal.csv in the directory the spider is run from.
custom_settings = {
    "FEEDS": {r"./fcdc/webscrp/outputfinal.csv": {"format": "csv", "overwrite": True}}
}
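For reference, here is a minimal, self-contained sketch of a spider that relies only on the FEEDS setting (the spider name, URL, and field below are placeholders, not taken from the question):

import scrapy

class FeedsDemoSpider(scrapy.Spider):
    # hypothetical spider used only to illustrate the FEEDS feed export
    name = "feeds_demo"
    start_urls = ["https://quotes.toscrape.com/"]

    # Scrapy only reads this attribute if it is named custom_settings (plural)
    custom_settings = {
        "FEEDS": {
            r"./output/quotes.csv": {"format": "csv", "overwrite": True},
        },
    }

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}

Running scrapy crawl feeds_demo should then create ./output/quotes.csv without any spider_closed handler.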

Related

Scrapy Crawler: Avoid Duplicate Crawling of URLs

I have created a crawler using Scrapy. The crawler crawls the website and fetches the URLs.
Technology used: Python, Scrapy
Issue: I am getting duplicate URLs.
What I need the output to be:
I want the crawler to crawl the website and fetch the URLs, but not crawl the duplicate URLs.
Sample code:
I have added this line to my settings.py file:
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
When I ran it, it said the module was not found.
import scrapy
import os
import scrapy.dupefilters

class MySpider(scrapy.Spider):
    name = 'feed_exporter_test'
    # this is equivalent to what you would set in the settings.py file
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'inputLinks2.csv'
    }
    filePath = 'inputLinks2.csv'
    if os.path.exists(filePath):
        os.remove(filePath)
    else:
        print("Can not delete the file as it doesn't exists")

    start_urls = ['https://www.mytravelexp.com/']

    def parse(self, response):
        titles = response.xpath("//a/@href").extract()
        for title in titles:
            yield {'title': title}

    def __getid(self, url):
        mm = url.split("&refer")[0]  # or something like that
        return mm

    def request_seen(self, request):
        fp = self.__getid(request.url)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)
Please help!!
Scrapy filters duplicate requests by default.
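With the default filter in place, it is usually enough to drop the copied request_seen/__getid code from the spider and, if you do want to set the filter explicitly, reference the correct module path (scrapy.dupefilters, plural). A minimal sketch, keeping only the start URL from the question:

import scrapy

class FeedExporterTestSpider(scrapy.Spider):
    name = "feed_exporter_test"
    start_urls = ["https://www.mytravelexp.com/"]

    # This simply restates the default duplicate filter and can be omitted;
    # note the module is scrapy.dupefilters, not scrapy.dupefilter.
    custom_settings = {"DUPEFILTER_CLASS": "scrapy.dupefilters.RFPDupeFilter"}

    def parse(self, response):
        for href in response.xpath("//a/@href").getall():
            # duplicate requests to the same URL are filtered automatically
            yield {"title": href}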

How to run multiple spiders through individual pipelines?

Total noob just getting started with scrapy.
My directory structure looks like this...
#FYI: running on Scrapy 2.4.1
WebScraper/
    Webscraper/
        spiders/
            spider.py      # (NOTE: contains spider1 and spider2 classes.)
        items.py
        middlewares.py
        pipelines.py       # (NOTE: contains spider1Pipeline and spider2Pipeline)
        settings.py        # (NOTE: I wrote here:
                           #   ITEM_PIPELINES = {
                           #       'WebScraper.pipelines.spider1_pipelines': 300,
                           #       'WebScraper.pipelines.spider2_pipelines': 300,
                           #   }
                           # )
    scrapy.cfg
And spider2.py resembles...
class OneSpider(scrapy.Spider):
    name = "spider1"

    def start_requests(self):
        urls = ["url1.com",]
        yield scrapy.Request(
            url="http://url1.com",
            callback=self.parse
        )

    def parse(self, response):
        # Scrape stuff, put it in a dict
        yield dictOfScrapedStuff

class TwoSpider(scrapy.Spider):
    name = "spider2"

    def start_requests(self):
        urls = ["url2.com",]
        yield scrapy.Request(
            url="http://url2.com",
            callback=self.parse
        )

    def parse(self, response):
        # Scrape stuff, put it in a dict
        yield dictOfScrapedStuff
With pipelines.py looking like...
class spider1_pipelines(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('spider1.csv', 'w', newline=''))
        self.csvwriter.writerow(['header1', 'header2'])

    def process_item(self, item, spider):
        row = []
        row.append(item['header1'])
        row.append(item['header2'])
        self.csvwrite.writerow(row)

class spider2_pipelines(object):
    def __init__(self):
        self.csvwriter = csv.writer(open('spider2.csv', 'w', newline=''))
        self.csvwriter.writerow(['header_a', 'header_b'])

    def process_item(self, item, spider):
        row = []
        row.append(item['header_a'])  # NOTE: this is not the same as header1
        row.append(item['header_b'])  # NOTE: this is not the same as header2
        self.csvwrite.writerow(row)
I have a question about running spider1 and spider2 on different urls with one terminal command:
nohup scrapy crawl spider1 -o spider1_output.csv --logfile spider1.log & scrapy crawl spider2 -o spider2_output.csv --logfile spider2.log
Note: this is an extension of a previous question specific to this stack overflow post (2018).
Desired result: spider1.csv with data from spider1, spider2.csv with data from spider2.
Current result: spider1.csv gets the data from spider1, but spider2.csv breaks; the error log contains spider2's data along with a KeyError: 'header1', even though the spider2 item does not include header1, only header_a.
Does anyone know how to run one spider after the other on different URLs and route the data fetched by spider1, spider2, etc. into a pipeline specific to that spider, i.e. spider1 -> spider1Pipeline -> spider1.csv and spider2 -> spider2Pipeline -> spider2.csv?
Or perhaps this is a matter of defining spider1_item and spider2_item in items.py? I wonder if I can specify where to put spider2's data that way.
Thank you!
You can implement this by using the custom_settings spider attribute to set the settings individually for each spider:
# spider2.py
class OneSpider(scrapy.Spider):
    name = "spider1"
    custom_settings = {
        'ITEM_PIPELINES': {'WebScraper.pipelines.spider1_pipelines': 300}
    }
    ...

class TwoSpider(scrapy.Spider):
    name = "spider2"
    custom_settings = {
        'ITEM_PIPELINES': {'WebScraper.pipelines.spider2_pipelines': 300}
    }
    ...
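If you also want to start both spiders from a single script instead of chaining shell commands, one option is CrawlerProcess. This is only a sketch; it assumes both spider classes live in WebScraper/spiders/spider.py as described above, and note that the two crawls run in the same process (concurrently, not strictly one after the other):

# run_spiders.py, placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from WebScraper.spiders.spider import OneSpider, TwoSpider

process = CrawlerProcess(get_project_settings())
process.crawl(OneSpider)   # picks up spider1's custom_settings and pipeline
process.crawl(TwoSpider)   # picks up spider2's custom_settings and pipeline
process.start()            # blocks until both crawls are finished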

How to deal with redirects to a bookmark within a page in Scrapy (911 error)

I am very new to programming, so apologies if this is a rookie issue. I am a researcher, and I've been building spiders to allow me to crawl specific search results of IGN, the gaming forum. The first spider collects each entry in the search results, along with URLs, and then the second spider crawls each of those URLs for the content.
The problem is that IGN redirects URLs associated with a specific post to a new URL that incorporates a #bookmark at the end of the address. This allows the visitor to the page to jump directly down to the post in question, but I want my spider to crawl over the entire thread. In addition, my spider ends up with a (911) error after the redirect and returns no data. The only data retrieved is from any search results that linked directly to a thread rather than a post.
I am absolutely stumped and confused, so any help would be amazing! Both spiders are attached below.
Spider 1:
myURLs = []
baselineURL = "https://www.ign.com/boards/search/186716896/?q=broforce&o=date&page="
for counter in range(1, 5):
    myURLs.append(baselineURL + str(counter))

class BroforceIGNScraper(scrapy.Spider):
    name = "foundation"
    start_urls = myURLs

    def parse(self, response):
        for post in response.css("div.main"):
            yield {
                'title': post.css("h3.title a::text").extract_first(),
                'author': post.css("div.meta a.username::text").extract_first(),
                'URL': post.css('h3 a').xpath('@href').extract_first(),
            }
Spider 2:
URLlist = []
baseURL = "https://www.ign.com/boards/"

import csv
with open('BroforceIGNbase.csv', 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        URLlist.append(baseURL + row['URL'])

class BroforceIGNScraper(scrapy.Spider):
    name = "posts2"
    start_urls = URLlist
    # handle_httpstatus_list = [301]

    def parse(self, response):
        for post in response.css(".messageList"):
            yield {
                'URL': response.url,
                'content': post.css(".messageContent article").extract_first(),
                'commentauthor': post.css("div.messageMeta a::text").extract_first(),
                'commentDateTime': post.css('div.messageMeta a span.DateTime').xpath('@title').extract_first(),
            }
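Since the redirect behaviour is hard to see from the output alone, one hedged diagnostic idea: by default Scrapy's HttpError middleware filters out non-2xx responses before they reach the callback, so letting them through makes it possible to see which status code and final URL each thread request actually ends up with after the redirect. The spider below is purely illustrative (it reuses URLlist from above) and is not a verified fix for the 911 error:

import scrapy

class RedirectDebugSpider(scrapy.Spider):
    # hypothetical helper spider: it only reports the requested URL,
    # the URL the redirect landed on, and the status code (e.g. the 911)
    name = "posts2_debug"

    def start_requests(self):
        for url in URLlist:
            # allow non-200 responses to reach parse() instead of being dropped
            yield scrapy.Request(url, callback=self.parse,
                                 meta={"handle_httpstatus_all": True})

    def parse(self, response):
        yield {
            "requested": response.request.url,
            "final": response.url,
            "status": response.status,
        }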

Scraping Japanese website using Scrapy but no data in output file

I am new to Scrapy. I want to scrape some data from a Japanese website, but when I run the following spider, it doesn't write any data to the exported file. Can someone help me please?
Exporting to CSV doesn't show any results in the shell either, just [].
Here is my code.
import scrapy

class suumotest(scrapy.Spider):
    name = "testsecond"
    start_urls = [
        'https://suumo.jp/jj/chintai/ichiran/FR301FC005/?tc=0401303&tc=0401304&ar=010&bs=040'
    ]

    def parse(self, response):
        # for following property links
        for href in response.css('.property_inner-title+a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_info)

    # defining parser to extract data
    def parse_info(self, response):
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        yield {
            'Title': extract_with_css('h1.section_title::text'),
            'Fee': extract_with_css('td.detailinfo-col--01 span.detailvalue-item-accent::text'),
            'Fee Descrition': extract_with_css('td.detailinfo-col--01 span.detailvalue-item-text::text'),
            'Prop Description': extract_with_css('td.detailinfo-col--03::text'),
            'Prop Address': extract_with_css('td.detailinfo-col--04::text'),
        }
Your first CSS selector in the parse method is faulty:
response.css('.property_inner-title+a::attr(href)').extract()
The + (adjacent-sibling combinator) is the fault here. Just replace it with a space (descendant combinator):
response.css('.property_inner-title a::attr(href)').extract()
Another issue is in your defined extract_with_css() function:
def parse_info(self, response):
    def extract_with_css(query):
        return response.css(query).extract_first().strip()
The problem here is that extract_first() returns None by default if no values are found, and .strip() is a method of str; since you're getting None rather than a string, this raises an AttributeError.
To fix that, set the default value of extract_first() to an empty string instead:
def parse_info(self, response):
    def extract_with_css(query):
        return response.css(query).extract_first('').strip()
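Newer Scrapy versions also expose .get() and .getall() as the preferred spellings of extract_first() and extract(); .get() accepts the same default, so the helper could equally be written as in this small sketch:

def parse_info(self, response):
    def extract_with_css(query):
        # .get(default='') behaves like .extract_first('')
        return response.css(query).get(default='').strip()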

How to add try exception in scrapy spider?

I built a simple crawler application using urllib2 and BeautifulSoup, and now I am planning to convert it into a Scrapy spider, but how can I handle errors while running the crawler?
My current application has some code like this:
error_file = open('errors.txt', 'a')
finish_file = open('finishlink.txt', 'a')
try:
    # code to process each link
    # if the link finishes successfully, store it in the 'finish.txt' file
except Exception as e:
    # write the link to the 'errors.txt' file along with the error
So when I process thousands of links, the successfully processed links are stored in finish.txt and the errors end up in errors.txt, so I can re-run the links from errors.txt later until they are processed successfully.
How can I accomplish this in the following code?
class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        filename = response.url.split("/")[-2]
        with open(filename + '.txt', 'wb') as f:
            f.write(response.body)
You can create a spider middleware and override the process_spider_exception() method, saving the links in a file there.
A spider middleware is just a way for you to extend Scrapy's behavior.
Here is a full example that you can modify as needed for your purpose:
from scrapy import signals

class SaveErrorsMiddleware(object):
    def __init__(self, crawler):
        crawler.signals.connect(self.close_spider, signals.spider_closed)
        crawler.signals.connect(self.open_spider, signals.spider_opened)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def open_spider(self, spider):
        self.output_file = open('somefile.txt', 'a')

    def close_spider(self, spider):
        self.output_file.close()

    def process_spider_exception(self, response, exception, spider):
        self.output_file.write(response.url + '\n')
Put this in a module and set it up in settings.py:
SPIDER_MIDDLEWARES = {
    'myproject.middleware.SaveErrorsMiddleware': 1000,
}
This code will run together with your spider, triggering the open_spider(), close_spider(), and process_spider_exception() methods when appropriate.
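For failures at the download level (timeouts, DNS or connection errors) that never reach a spider callback, Scrapy also lets you attach an errback to each request. A hedged sketch of the same finishlink.txt/errors.txt idea done inside the spider itself (the file names are taken from the question, the rest is illustrative):

import scrapy

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        # the download succeeded: record the link
        with open('finishlink.txt', 'a') as f:
            f.write(response.url + '\n')

    def on_error(self, failure):
        # the download failed: record the link so it can be retried later
        with open('errors.txt', 'a') as f:
            f.write(failure.request.url + '\n')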
Read more:
Spider Middlewares
Signals in Scrapy
Example middleware in Scrapy source code
