Scrapy Crawler:Avoid Duplicate Crawling of URLs - python

I have created a crawler using Scrapy. The crawler crawls the website and fetches the URLs.
Technology Used:Python Scrapy
Issue:I am having duplication of URLs.
What I need the output to be:
I want the crawler to crawl the website and fetch the URLs, but not crawl the duplicate URLs.
Sample Code:
I have added this code to my settings.py file.
DUPEFILTER_CLASS ='scrapy.dupefilter.RFPDupeFilter'
I ran the file; it says module not found.
import scrapy
import os
import scrapy.dupefilters
class MySpider(scrapy.Spider):
    """Crawl mytravelexp.com and export every anchor href to a CSV feed.

    NOTE(review): __getid/request_seen are duplicate-filter hooks; they
    only take effect on a scrapy.dupefilters.RFPDupeFilter subclass that
    is registered via the DUPEFILTER_CLASS setting, not on a Spider -
    confirm they are meant to live here.
    """

    name = 'feed_exporter_test'

    # this is equivalent to what you would set in settings.py file
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'inputLinks2.csv',
    }

    # Remove a stale export file so each run starts with a fresh feed.
    filePath = 'inputLinks2.csv'
    if os.path.exists(filePath):
        os.remove(filePath)
    else:
        print("Can not delete the file as it doesn't exists")

    start_urls = ['https://www.mytravelexp.com/']

    def parse(self, response):
        """Yield one {'title': href} item per anchor on the page."""
        # "@href" selects the attribute; "#href" (a copy/paste artifact)
        # is not valid XPath and would raise a ValueError.
        titles = response.xpath("//a/@href").extract()
        for title in titles:
            yield {'title': title}

    def __getid(self, url):
        """Normalize a URL by stripping everything from '&refer' on."""
        return url.split("&refer")[0]

    def request_seen(self, request):
        """Return True when the normalized URL was already crawled.

        Otherwise record its fingerprint (in memory and, when a file is
        attached, on disk) and fall through to None (not seen).
        """
        fp = self.__getid(request.url)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)
Please help!!

Scrapy filters duplicate requests by default.

Related

How to export scrapped data using FEEDS/FEED EXPORTS in scrapy

I'm new to webscraping/scrapy and python
Scrapy version: Scrapy 2.5.1
OS: windows
IDE: pycharm
I am trying to use the FEEDS option in Scrapy to automatically export the scraped data from a website into an Excel-readable file.
I tried the following Stack Overflow solution, but it didn't work. I'm not sure what I'm doing wrong here — am I missing something?
i also tried to add the same in my settings.py file after commenting custom_settings in my spider class as per example provided in documentation: https://docs.scrapy.org/en/latest/topics/feed-exports.html?highlight=feed#feeds
for now i achieved my requirement using spider_closed (signal) to write data to CSV by storing all the scraped items data in a array called result
class SpiderFC(scrapy.Spider):
    """Scrape FC listings and export them through Scrapy's FEEDS exporter."""

    name = "FC"
    start_urls = [
        url,  # NOTE(review): "url" is defined elsewhere in the module - confirm
    ]
    # BUG FIX: the attribute must be named "custom_settings" (plural).
    # The original "custom_setting" is silently ignored by Scrapy, which
    # is why the FEEDS export never produced a file.
    custom_settings = {"FEEDS": {r"C:\Users\rreddy\PycharmProjects\fcdc\webscrp\outputfinal.csv": {"format": "csv", "overwrite": True}}}

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and connect the spider_closed signal handler."""
        spider = super(SpiderFC, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def __init__(self, name=None):
        super().__init__(name)
        self.count = None

    def parse(self, response, **kwargs):
        # each item scraped from the parent page has links where the actual
        # data needs to be scraped, so follow each link and scrape the data
        yield response.follow(notice_href_follow, callback=self.parse_item,
                              meta={'item': item, 'index': index, 'next_page': next_page})

    def parse_item(self, response):
        # logic for items to scrape goes here; they are saved to a temp list,
        # appended to the module-level "result" array, and the list is cleared
        result.append(it)  # result data is used at the end to write to csv
        item.clear()
        if next_page:
            yield next(self.follow_next(response, next_page))

    def follow_next(self, response, next_page):
        """Yield a request for the next listing page."""
        next_page_url = urljoin(url, next_page[0])
        yield response.follow(next_page_url, callback=self.parse)
spider closed signal
def spider_closed(self, spider):
    # Fallback export: once the spider finishes (spider_closed signal),
    # append every collected row's "city" value to the CSV.
    # NOTE(review): output_path and result appear to be module-level
    # names defined elsewhere - confirm.
    with open(output_path, mode="a", newline='') as f:
        writer = csv.writer(f)
        for v in result:
            writer.writerow([v["city"]])
When all data is scraped and all requests are completed, the spider_closed signal will write the data to a CSV. However, I'm trying to avoid this logic and use Scrapy's built-in exporter instead, but I'm having trouble exporting the data.
Check your path. If you are on windows then provide the full path in the custom_settings e.g. as below
# Windows: give the full absolute path (the r"" raw string keeps the
# backslashes from being treated as escape sequences).
custom_settings = {
    "FEEDS":{r"C:\Users\Name\Path\To\outputfinal.csv" : {"format" : "csv", "overwrite":True}}
}
If you are on linux or MAC then provide the path as below:
# Linux/macOS: give the absolute POSIX path to the output file.
custom_settings = {
    "FEEDS":{r"/Path/to/folder/fcdc/webscrp/outputfinal.csv" : {"format" : "csv", "overwrite":True}}
}
Alternatively provide the relative path as below which will create a folder structure of fcdc>>webscrp>>outputfinal.csv in the directory from which the spider is run from.
# Relative path: creates fcdc/webscrp/outputfinal.csv under the directory
# the spider is run from.
custom_settings = {
    "FEEDS":{r"./fcdc/webscrp/outputfinal.csv" : {"format" : "csv", "overwrite":True}}
}

Scrapy dont save values to items

Since today, my spider wont save any information to my items "DuifpicturesItem".
I got almost the same spider created for a different customer, but this one wont save anything, idk why. My items.py only have two fields: Images and Link
In my console, I can see that it collects the right data, but it doesn't save it.
My Code
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import DuifpicturesItem
from scrapy.http import Request, FormRequest
import csv
class DuifLogin(CrawlSpider):
    """Log in to duif.nl, then scrape product pages listed in a CSV.

    NOTE(review): CrawlSpider uses parse() internally to drive its Rules;
    overriding parse() as the login callback (as done here) disables
    rule-based crawling - confirm that is intended, or rename the callback.
    """

    name = "duiflogin"
    allowed_domains = ['duif.nl']
    login_page = 'https://www.duif.nl/login'
    custom_settings = {'FEED_EXPORT_FIELDS' : ['SKU', 'Title', 'Price', 'Link', 'Title_small', 'NL_PL_PC', 'Description' ] }

    # Seed start_urls from a previously exported CSV of product links.
    with open("duifonlylinks.csv","r") as f:
        reader = csv.DictReader(f)
        start_urls = [items['Link'] for items in reader]

    rules = (
        Rule(
            LinkExtractor(),
            callback='parse_page',
            follow=True
        ),
    )

    def start_requests(self):
        """Always hit the login page first, bypassing the dupe filter."""
        yield Request(
            url=self.login_page,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        """Submit the login form, then continue in after_loging."""
        return FormRequest.from_response(response, formdata={
            'username' : '****',
            'password' : '****',
            'submit' : ''
        }, callback=self.after_loging)

    def after_loging(self, response):
        # "@class" selects the attribute; the original "#class" is not
        # valid XPath and would raise a ValueError.
        accview = response.xpath('//div[@class="c-accountbox clearfix js-match-height"]/h3')
        if accview:
            print('success')
        else:
            print(':(')
        for url in self.start_urls:
            yield response.follow(url=url, callback=self.parse_page)

    def parse_page(self, response):
        """Yield one DuifpicturesItem per product detail block."""
        productpage = response.xpath('//div[@class="product-details col-md-12"]')
        if not productpage:
            print('No product', response.url)
        for a in productpage:
            items = DuifpicturesItem()
            items['Link'] = response.url
            # Same artifact fix: "@src", not "#src".
            items['Images'] = response.xpath('//div[@class="inner"]/img/@src').getall()
            yield items
My console
Here you can see that it scrapes links and images like I want it to, but the .csv/.json file is still empty.
P.S.
The login data isn't correct, but for this process I don't have to be logged in, so I guess it doesn't affect the crawling process.
Not sure what you mean by "save it". Since you made no mention of a pipeline, I'm assuming you don't have one for handling your items, so your items are being kept in memory only.
If you want to save your scraped items into a file you need to use the feed export. Simplest way would be:
scrapy crawl myspider -o items.json
It supports other formats, check the documentation.
If you meant to save into a DB or do something else with the data check the ItemPipelines.

Python + Scrapy: Issues running "ImagesPipeline" when running crawler from script

I'm brand new to Python so I apologize if there's a dumb mistake here...I've been scouring the web for days, looking at similar issues and combing through Scrapy docs and nothing seems to really resolve this for me...
I have a Scrapy project which successfully scrapes the source website, returns the required items, and then uses an ImagePipeline to download (and then rename accordingly) the images from the returned image links... but only when I run from the terminal with "runspider".
Whenever I use "crawl" from the terminal or CrawlProcess to run the spider from within the script, it returns the items but does not download the images and, I assume, completely misses the ImagePipeline.
I read that I needed to import my settings when running this way in order to properly load the pipeline, which makes sense after looking into the differences between "crawl" and "runspider" but I still cannot get the pipeline working.
There are no error messages but I notice that it does return "[scrapy.middleware] INFO: Enabled item pipelines: []" ... Which I assumed was showing that it is still missing my pipeline?
Here's my spider.py:
import scrapy
from scrapy2.items import Scrapy2Item
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
class spider1(scrapy.Spider):
    """Scrape title, first image URL, and SKU from one Amazon.ca search page."""

    name = "spider1"
    domain = "https://www.amazon.ca/s?k=821826022317"

    def start_requests(self):
        yield scrapy.Request(url=spider1.domain, callback=self.parse)

    def parse(self, response):
        """Build and yield a single Scrapy2Item from the response."""
        items = Scrapy2Item()
        titlevar = response.css('span.a-text-normal ::text').extract_first()
        # image_urls must be a list for ImagesPipeline to consume.
        imgvar = [response.css('img ::attr(src)').extract_first()]
        # "@name"/"@content" select attributes; the original "#name" and
        # "#content" (copy/paste artifacts) are not valid XPath.
        skuvar = response.xpath('//meta[@name="keywords"]/@content')[0].extract()
        items['title'] = titlevar
        items['image_urls'] = imgvar
        items['sku'] = skuvar
        yield items
# Guard the crawl so importing this module (e.g. via "scrapy crawl") does
# not immediately start a second CrawlerProcess/reactor.
if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider1)
    process.start()
Here is my items.py:
import scrapy
class Scrapy2Item(scrapy.Item):
    """Item carrying the product title, its image URL list, and the SKU."""
    title = scrapy.Field()
    image_urls = scrapy.Field()  # consumed by ImagesPipeline
    sku = scrapy.Field()
Here is my pipelines.py:
import scrapy
from scrapy.pipelines.images import ImagesPipeline
class Scrapy2Pipeline(ImagesPipeline):
    """ImagesPipeline that names each downloaded image after the item's SKU."""

    def get_media_requests(self, item, info):
        # Pass the SKU along in request.meta so file_path can use it.
        return [scrapy.Request(x, meta={'image_name': item['sku']})
            for x in item.get('image_urls', [])]

    def file_path(self, request, response=None, info=None):
        # NOTE(review): Scrapy >= 2.4 also passes item=None to file_path -
        # confirm this signature against the installed Scrapy version.
        return '%s.jpg' % request.meta['image_name']
Here is my settings.py:
# Scrapy project settings (settings.py).
BOT_NAME = 'scrapy2'
SPIDER_MODULES = ['scrapy2.spiders']
NEWSPIDER_MODULE = 'scrapy2.spiders'
ROBOTSTXT_OBEY = True
# Enable the image pipeline; lower numbers run earlier.
ITEM_PIPELINES = {
    'scrapy2.pipelines.Scrapy2Pipeline': 1,
}
# Directory (relative to the crawl's working dir) where images are stored.
IMAGES_STORE = 'images'
Thank you to anybody that looks at this or even attempts to help me out. It's greatly appreciated.
Since you are running your spider as a script, there is no scrapy project environment, get_project_settings won't work (aside from grabbing the default settings).
The script must be self-contained, i.e. contain everything you need to run your spider (or import it from your python search path, like any regular old python code).
I've reformatted that code for you, so that it runs, when you execute it with the plain python interpreter: python3 script.py.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import scrapy
from scrapy.pipelines.images import ImagesPipeline
# Settings inlined into the script (normally these live in settings.py).
BOT_NAME = 'scrapy2'
ROBOTSTXT_OBEY = True
IMAGES_STORE = 'images'  # download target for the ImagesPipeline
class Scrapy2Item(scrapy.Item):
    """Item carrying the product title, its image URL list, and the SKU."""
    title = scrapy.Field()
    image_urls = scrapy.Field()  # consumed by ImagesPipeline
    sku = scrapy.Field()
class Scrapy2Pipeline(ImagesPipeline):
    """ImagesPipeline that names each downloaded image after the item's SKU."""

    def get_media_requests(self, item, info):
        # Pass the SKU along in request.meta so file_path can use it.
        return [scrapy.Request(x, meta={'image_name': item['sku']})
            for x in item.get('image_urls', [])]

    def file_path(self, request, response=None, info=None):
        # NOTE(review): Scrapy >= 2.4 also passes item=None to file_path -
        # confirm this signature against the installed Scrapy version.
        return '%s.jpg' % request.meta['image_name']
class spider1(scrapy.Spider):
    """Scrape title, first image URL, and SKU from one Amazon.ca search page."""

    name = "spider1"
    domain = "https://www.amazon.ca/s?k=821826022317"

    def start_requests(self):
        yield scrapy.Request(url=spider1.domain, callback=self.parse)

    def parse(self, response):
        """Build and yield a single Scrapy2Item from the response."""
        items = Scrapy2Item()
        titlevar = response.css('span.a-text-normal ::text').extract_first()
        # image_urls must be a list for ImagesPipeline to consume.
        imgvar = [response.css('img ::attr(src)').extract_first()]
        # "@name"/"@content" select attributes; the original "#name" and
        # "#content" (copy/paste artifacts) are not valid XPath.
        skuvar = response.xpath('//meta[@name="keywords"]/@content')[0].extract()
        items['title'] = titlevar
        items['image_urls'] = imgvar
        items['sku'] = skuvar
        yield items
if __name__ == "__main__":
    # Build the Settings object inline: outside a scrapy project directory
    # get_project_settings() would only return defaults.
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings
    settings = Settings(values={
        'BOT_NAME': BOT_NAME,
        'ROBOTSTXT_OBEY': ROBOTSTXT_OBEY,
        'ITEM_PIPELINES': {
            # The pipeline is registered by its __main__ path because this
            # is a self-contained script, not an installed package.
            '__main__.Scrapy2Pipeline': 1,
        },
        'IMAGES_STORE': IMAGES_STORE,
        'TELNETCONSOLE_ENABLED': False,
    })
    process = CrawlerProcess(settings=settings)
    process.crawl(spider1)
    process.start()

Scrapy Files Pipeline not downloading files

I have been tasked with building a web crawler that downloads all .pdfs in a given site. Spider runs on local machine and on scraping hub. For some reason when I run it only downloads some but not all of the pdfs. This can be seen by looking at the items in the output JSON.
I have set MEDIA_ALLOW_REDIRECTS = True and tried to run it on scrapinghub as well as locally
Here is my spider
import scrapy
from scrapy.loader import ItemLoader
from poc_scrapy.items import file_list_Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class PdfCrawler(CrawlSpider):
    """Crawl www.groton.org and yield one item per PDF link not seen before."""

    # URLs already emitted, so each PDF is loaded into an item only once.
    downloaded_set = {''}
    name = 'example'
    allowed_domains = ['www.groton.org']
    start_urls = ['https://www.groton.org']
    rules = (
        Rule(LinkExtractor(allow='www.groton.org'), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        """Collect every .pdf href on the page and send it to the item pipeline."""
        print('parseing', response)
        # "@href" selects the attribute; the original "#href" is not valid XPath.
        all_href = response.xpath('/html/body//a/@href').extract()

        # Keep only non-empty links that end in ".pdf".
        pdf_urls = [href for href in all_href if len(href) >= 1 and href[-4:] == '.pdf']

        for pdf in pdf_urls:
            # BUG FIX: the original referenced new_pdf in the absolute-URL
            # branch, where it was never assigned - it either held the value
            # from a previous iteration (deduping the wrong URL) or raised
            # NameError on the first absolute link.
            if pdf[0:5] != 'http':
                full_url = response.urljoin(pdf)
            else:
                full_url = pdf
            if full_url in self.downloaded_set:
                # We have seen it before, don't do anything.
                continue
            self.downloaded_set.add(full_url)
            loader = ItemLoader(item=file_list_Item())
            loader.add_value('file_urls', full_url)
            loader.add_value('base_url', response.url)
            yield loader.load_item()
settings.py
# Follow redirects when downloading media (some PDFs 301 to another host).
MEDIA_ALLOW_REDIRECTS = True
BOT_NAME = 'poc_scrapy'
SPIDER_MODULES = ['poc_scrapy.spiders']
NEWSPIDER_MODULE = 'poc_scrapy.spiders'
# NOTE(review): robots.txt is obeyed here; per the question's own update,
# this is what blocked some of the PDF downloads.
ROBOTSTXT_OBEY = True
DOWNLOADER_MIDDLEWARES = {
    # Disable the stock user-agent middleware in favor of the rotator.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'poc_scrapy.middlewares.UserAgentMiddlewareRotator': 400,
}
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline':1
}
# Directory where downloaded files are stored.
FILES_STORE = 'pdfs/'
AUTOTHROTTLE_ENABLED = True
here is the output a small portion of the output
{
"file_urls": [
"https://www.groton.org/ftpimages/542/download/download_3402393.pdf"
],
"base_url": [
"https://www.groton.org/parents/business-office"
],
"files": []
},
As you can see, the PDF file is in file_urls but was not downloaded. There are 5 warning messages indicating that some of them could not be downloaded, but there are over 20 missing files.
Here is the warning message I get for some of the files
[scrapy.pipelines.files] File (code: 301): Error downloading file from <GET http://groton.myschoolapp.com/ftpimages/542/download/Candidate_Statement_2013.pdf> referred in <None>
[scrapy.core.downloader.handlers.http11] Received more bytes than download warn size (33554432) in request <GET https://groton.myschoolapp.com/ftpimages/542/download/download_1474034.pdf>
.
I would expect that all the files will be download or at least a warning message for all files that are not downloaded. Maybe there is a workaround.
Any feedback is greatly appreciated. Thanks!
UPDATE: I realized that the problem was that robots.txt was not allowing me to visit some of the pdfs. This could be fixed by using an other service to download them or by not following robots.txt

2 functions in scrapy spider and the second one not running

I am using scrapy to get the content inside some urls on a page, similar to this question here:
Use scrapy to get list of urls, and then scrape content inside those urls
I am able to get the subURLs from my start urls(first def), However, my second def doesn't seem to be passing through. And the result file is empty. I have tested the content inside the function in scrapy shell and it is getting the info I want, but not when I am running the spider.
import scrapy
from scrapy.selector import Selector
#from scrapy import Spider
from WheelsOnlineScrapper.items import Dealer
from WheelsOnlineScrapper.url_list import urls
import logging
from urlparse import urljoin
logger = logging.getLogger(__name__)
class WheelsonlinespiderSpider(scrapy.Spider):
    """Follow dealer-directory links on wheelsonline.ca and yield Dealer items."""

    logger.info('Spider starting')
    name = 'wheelsonlinespider'
    rotate_user_agent = True  # lives in middleware.py and settings.py
    # BUG FIX: allowed_domains accepts bare domain names only; including the
    # scheme ("https://...") makes the offsite middleware drop every request
    # yielded from parse(), so parse_dealers never ran.
    allowed_domains = ["wheelsonline.ca"]
    start_urls = urls  # this list is created in url_list.py
    logger.info('URLs retrieved')

    def parse(self, response):
        """Follow every dealer-directory link to its detail page."""
        partialURLs = response.css('.directory_name::attr(href)').extract()
        for i in partialURLs:
            subURL = urljoin('https://wheelsonline.ca/', i)
            yield scrapy.Request(subURL, callback=self.parse_dealers)
            logger.info('Dealer ' + subURL + ' fetched')

    def parse_dealers(self, response):
        """Extract a Dealer item from a dealer detail page."""
        logger.info('Beginning of page')
        dlr = Dealer()
        # Extracting the content using css selectors; a missing aux name makes
        # the "+" concatenation raise TypeError, so fall back to main name only.
        try:
            dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first() + ' ' + response.css(".dealer_head_aux_name::text").extract_first()
        except TypeError:
            dlr['DealerName'] = response.css(".dealer_head_main_name::text").extract_first()
        dlr['MailingAddress'] = ','.join(response.css(".dealer_address_right::text").extract())
        dlr['PhoneNumber'] = response.css(".dealer_head_phone::text").extract_first()
        logger.info('Dealer fetched ' + dlr['DealerName'])
        yield dlr
        logger.info('End of page')
Your allowed_domains list contains the protocol (https). It should have only the domain name as per the documentation:
allowed_domains = ["wheelsonline.ca"]
Also, you should've received a message in your log:
URLWarning: allowed_domains accepts only domains, not URLs. Ignoring URL entry https://wheelsonline.ca in allowed_domains

Categories

Resources