Scrapy delay request - python

Every time I run my code my IP gets banned. I need help delaying each request by 10 seconds. I've tried placing DOWNLOAD_DELAY in the code but it gives no results. Any help is appreciated.
import re

import scrapy


# item class included here
class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "https://washingtondc.craigslist.org/search/fua"
    ]
    BASE_URL = 'https://washingtondc.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/nos/vgm/" + item_id
            item = DmozItem()
            item["link"] = response.url
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item

You need to set DOWNLOAD_DELAY in the settings.py of your project. Note that you may also need to limit concurrency: by default concurrency is 8, so you are hitting the website with 8 simultaneous requests.
# settings.py
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 2
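Since the question asks for a 10-second delay specifically, a minimal sketch of the relevant settings might look like the following; the exact values are only a suggestion, and RANDOMIZE_DOWNLOAD_DELAY (on by default) makes Scrapy wait between 0.5 and 1.5 times DOWNLOAD_DELAY so the crawl looks less robotic.
# settings.py -- sketch for the 10-second delay asked about above
DOWNLOAD_DELAY = 10                  # roughly ten seconds between requests to the same slot
CONCURRENT_REQUESTS_PER_DOMAIN = 1   # one request at a time per domain
RANDOMIZE_DOWNLOAD_DELAY = True      # default: wait 0.5 * DELAY .. 1.5 * DELAY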
Starting with Scrapy 1.0 you can also place custom settings in the spider, so you could do something like this:
class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    custom_settings = {
        "DOWNLOAD_DELAY": 5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 2
    }
Delay and concurrency are set per downloader slot, not per request. To check what delay and concurrency you actually have, you could try something like this:
def parse(self, response):
    delay = self.crawler.engine.downloader.slots["www.dmoz.org"].delay
    concurrency = self.crawler.engine.downloader.slots["www.dmoz.org"].concurrency
    self.log("Delay {}, concurrency {} for request {}".format(delay, concurrency, response.request))
    return
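If a fixed 10-second delay turns out to be more than needed, Scrapy's AutoThrottle extension can adjust the delay dynamically based on how fast the server responds. A sketch of the relevant settings (the numbers are only illustrative):
# settings.py -- optional AutoThrottle sketch (values are illustrative)
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 10          # initial download delay
AUTOTHROTTLE_MAX_DELAY = 60            # upper bound when the server is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average concurrent requests per remote server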

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

As the title states, I am trying to run my Scrapy program; the issue I am running into is that it seems to only return the yield from the initial URL (https://www.antaira.com/products/10-100Mbps).
I am unsure where my program is going wrong. In my code I have also left some commented code showing what I have attempted.
import scrapy
from ..items import AntairaItem


class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason, when I run "scrapy crawl productJumperFix" I'm not getting any output in the terminal, and I'm not sure how to debug since I can't even see the error output.
Try using the start_requests method. (Note that in the original spider the start_urls list is missing commas between most of its entries, so Python's implicit string-literal concatenation merges those adjacent URL strings into one invalid URL, which is why only the first page gets crawled.)
For example:
import scrapy
from ..items import AntairaItem


class ProductJumperFix(scrapy.Spider):
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()
            items['product_link'] = response.url
            name = product.css('h1.product-name::text').get().strip()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            # note: no trailing commas here, otherwise each field becomes a one-element tuple
            items['name'] = name
            items['features'] = features
            items['overview'] = overview
            items['main_image'] = main_image
            items['rel_links'] = rel_links
            yield items
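On the follow-up about seeing no terminal output: one way to check what is happening (a suggestion, not part of the original answer) is to write the scraped items to a file with scrapy crawl productJumperFix -o items.json and to make sure logging is enabled and verbose while debugging, for example via settings.py:
# settings.py -- sketch: keep logging on and verbose while debugging
LOG_ENABLED = True
LOG_LEVEL = "DEBUG"   # "DEBUG" is the default; "INFO" is usually enough once things work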

scrapy getting values from multiple sites

I'm trying to pass a value from one function to another. I looked up the docs and just didn't understand them.
Reference:
def parse_page1(self, response):
    item = MyItem()
    item['main_url'] = response.url
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    yield item
Here is pseudocode of what I want to achieve:
import scrapy


class GotoSpider(scrapy.Spider):
    name = 'goto'
    allowed_domains = ['first.com', 'second.com']
    start_urls = ['http://first.com/']

    def parse(self, response):
        name = response.xpath(...)
        price = scrapy.Request(second.com, callback=self.parse_check)
        yield(name, price)

    def parse_check(self, response):
        price = response.xpath(...)
        return price
This is how you can pass any value, link, etc. to other methods:
import scrapy


class GotoSpider(scrapy.Spider):
    name = 'goto'
    allowed_domains = ['first.com', 'second.com']
    start_urls = ['http://first.com/']

    def parse(self, response):
        name = response.xpath(...)
        link = response.xpath(...)  # link for second.com where you may find the price
        request = scrapy.Request(url=link, callback=self.parse_check)
        request.meta['name'] = name
        yield request

    def parse_check(self, response):
        name = response.meta['name']
        price = response.xpath(...)
        yield {"name": name, "price": price}  # assuming that in your items.py the fields are declared as name, price

What is the mistake here?

This is my code. It seems correct to me, but it doesn't work. Please help.
HEADER_XPATH = ['//h1[@class="story-body__h1"]//text()']
AUTHOR_XPATH = ['//span[@class="byline__name"]//text()']
PUBDATE_XPATH = ['//div/@data-datetime']
WTAGS_XPATH = ['']
CATEGORY_XPATH = ['//span[@rev="news|source"]//text()']
TEXT = ['//div[@property="articleBody"]//p//text()']
INTERLINKS = ['//div[@class="story-body__link"]//p//a/@href']
DATE_FORMAT_STRING = '%Y-%m-%d'


class BBCSpider(Spider):
    name = "bbc"
    allowed_domains = ["bbc.com"]
    sitemap_urls = [
        'http://Www.bbc.com/news/sitemap/',
        'http://www.bbc.com/news/technology/',
        'http://www.bbc.com/news/science_and_environment/']

    def parse_page(self, response):
        items = []
        item = ContentItems()
        item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
        item['resource'] = urlparse(response.url).hostname
        item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
        item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
        item['tags'] = process_array_item(self, response, TAGS_XPATH, single=False)
        item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
        item['article_text'] = process_article_text(self, response, TEXT)
        item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
        item['link'] = response.url
        items.append(item)
        return items
Your spider is simply badly structured, and because of that it does nothing.
A scrapy.Spider requires a start_urls class attribute containing the list of URLs the spider will use to start the crawl, and all of these URLs call back to the class method parse, which means it is required as well.
Your spider has a sitemap_urls class attribute that is not used anywhere, and it also has a parse_page class method that is never used anywhere either.
So in short, your spider should look something like this:
class BBCSpider(Spider):
    name = "bbc"
    allowed_domains = ["bbc.com"]
    start_urls = [
        'http://Www.bbc.com/news/sitemap/',
        'http://www.bbc.com/news/technology/',
        'http://www.bbc.com/news/science_and_environment/']

    def parse(self, response):
        # This is a page with all of the articles
        article_urls = []  # find the article URLs in the page here
        for url in article_urls:
            yield Request(url, self.parse_page)

    def parse_page(self, response):
        # This is an article page
        item = ContentItems()
        # populate item
        return item
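To make the skeleton concrete, here is a sketch of what the two callbacks might look like. The XPath used to find article links is an assumption about the BBC listing pages (not something from the original answer) and would need to be adapted; the sketch also yields a plain dict instead of populating ContentItems with the project's process_* helpers, purely to stay self-contained.
from urllib.parse import urlparse

import scrapy


class BBCSpider(scrapy.Spider):
    name = "bbc"
    allowed_domains = ["bbc.com"]
    start_urls = [
        'http://www.bbc.com/news/technology/',
        'http://www.bbc.com/news/science_and_environment/',
    ]

    def parse(self, response):
        # Assumed selector: follow any /news/ article link on the listing page.
        for url in response.xpath('//a[contains(@href, "/news/")]/@href').extract():
            yield scrapy.Request(response.urljoin(url), callback=self.parse_page)

    def parse_page(self, response):
        # In the real project this is where ContentItems() would be populated
        # using the process_* helpers from the question.
        yield {
            'link': response.url,
            'resource': urlparse(response.url).hostname,
        }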

Scrapy not collecting emails properly

I'm using Scrapy to collect some data and everything works fine except the email extraction part. For some reason, the email row in the .csv file is blank or only a few emails are extracted. I've tried limiting download_delay and CLOSESPIDER_ITEMCOUNT but it's not working. Any help is much appreciated.
import re
import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["hanford.craigslist.org"]
    start_urls = [
        "http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
    ]
    BASE_URL = 'http://hanford.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/sdo/cto/" + item_id
            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
First of all, a quote from the Terms of Use as a warning:
USE. You agree not to use or provide software (except for general
purpose web browsers and email clients, or software expressly licensed
by us) or services that interact or interoperate with CL, e.g. for
downloading, uploading, posting, flagging, emailing, search, or mobile
use. Robots, spiders, scripts, scrapers, crawlers, etc. are
prohibited, as are misleading, unsolicited, unlawful, and/or spam
postings/email. You agree not to collect users' personal and/or
contact information ("PI").
Several things to fix here:
- the contact information is under reply/hnf/cto/ instead of reply/sdo/cto/
- specify the User-Agent and X-Requested-With headers
The complete code that works for me:
import re
from urlparse import urljoin  # Python 2; on Python 3 use: from urllib.parse import urljoin

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["hanford.craigslist.org"]
    start_urls = [
        "http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
    ]
    BASE_URL = 'http://hanford.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = urljoin(self.BASE_URL, link)
            yield scrapy.Request(absolute_url,
                                 callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = urljoin(self.BASE_URL, "reply/hnf/cto/" + item_id)
            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
            return scrapy.Request(url,
                                  meta={'item': item},
                                  callback=self.parse_contact,
                                  headers={"X-Requested-With": "XMLHttpRequest",
                                           "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36"})

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
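If you prefer not to repeat those headers on every request, they could also be set project-wide in settings.py. This is a suggestion beyond the original answer; USER_AGENT and DEFAULT_REQUEST_HEADERS are standard Scrapy settings:
# settings.py -- sketch: apply the same headers to every request
USER_AGENT = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36")
DEFAULT_REQUEST_HEADERS = {
    "X-Requested-With": "XMLHttpRequest",
}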

Combine FormRequest and CrawlSpider

I need to apply this FormRequest:
# Request = FormRequest.from_response(
#     response,
#     formname='frmSearch',
#     formdata={'classtype': 'of'},
#     # callback=self.parse_links,
#     dont_filter=True,
# )
It should be applied to the link in start_urls and to all pages that I get from the rules in my CrawlSpider.
from scrapy import Request, Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import AnbieterItem


class QuokaSpider(CrawlSpider):
    name = 'quoka'
    allowed_domains = ['www.quoka.de']
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen/']
    curr_page = 0

    rules = (Rule(LinkExtractor(allow=(r'.+'), restrict_xpaths=[u'//li[@class="arr-rgt active"]', ]),
                  follow=True, callback='parse_links'),
             )

    def _url(self, url):
        return 'http://www.quoka.de' + url

    def parse_links(self, response):
        hxs = Selector(response)
        lnks = hxs.xpath('//a[contains(@class, "img-lmtr") and contains(@class, "multi") or contains(@class, "single")]/@href').extract()
        filters = hxs.xpath(u'//div[@class="modal-title"]/text()').extract()
        for fil in filters:
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + fil + "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        for url in lnks:
            request = Request(self._url(url), callback=self.parse_object)
            yield request

    def parse_object(self, response):
        item = AnbieterItem()
        hxs = Selector(response)
        item['Beschreibung'] = hxs.xpath(u'//div[@class="text"]/text()').extract()
        # item['Kleinanzeigen_App'] = '1'
        # item['Preis'] = '1'
        return item
If I try to use start_requests to apply the filter, the spider does not use the pages from the rules.
How can I solve this problem and apply this filter to the start URL and to the URLs from the rules?
I don't know how to combine CrawlSpider rules with FormRequest, but I'd like to suggest that you replace the CrawlSpider with a generic Spider and create the requests manually.
The Rule in your code only takes care of following the pagination (as far as I can see). To replace that you could use something like the following code sample:
import scrapy


class TestSpider(scrapy.Spider):
    name = 'quoka'
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen']

    def parse(self, response):
        request = scrapy.FormRequest.from_response(
            response,
            formname='frmSearch',
            formdata={'classtype': 'of'},
            callback=self.parse_filtered
        )
        print(request.body)
        yield request

    def parse_filtered(self, response):
        resultList = response.xpath('//div[@id="ResultListData"]/ul/li')
        for resultRow in resultList:
            xpath_Result_Details = './/div[@class="q-col n2"]/a'
            # Check if row has details
            if resultRow.xpath(xpath_Result_Details):
                result_Details = resultRow.xpath(xpath_Result_Details)
                # If YES extract details
                title = result_Details.xpath('./@title').extract()
                href = result_Details.xpath('./@href').extract()[0]
                # Code to request detail pages goes here ...
                print(title, href)

        # Use this instead of CrawlSpider to follow the pagination links
        xpath_NextPage = '//div[@class="rslt-pagination"]//li[@class="arr-rgt active"]/a'
        if response.xpath(xpath_NextPage):
            nextPage_href = response.xpath(xpath_NextPage + '/@href').extract()[0]
            nextPage_url = 'http://www.quoka.de/immobilien/bueros-gewerbeflaechen' + nextPage_href
            nextPage_num = response.xpath(xpath_NextPage + '/@data-qng-page').extract()[0]

            # request = scrapy.Request(nextPage_url, callback=self.parse_filtered)
            # Create request with formdata ...
            request = scrapy.FormRequest.from_response(
                response,
                formname='frmNaviSearch',
                formdata={'pageno': nextPage_num},
                callback=self.parse_filtered
            )
            yield request
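Where the answer leaves the placeholder comment "Code to request detail pages goes here", one possible way to fill it in is sketched below. This is only an illustration: the spider name and the parse_detail callback are hypothetical, the sketch skips the FormRequest filtering for brevity, and the real project would populate AnbieterItem instead of a plain dict. response.follow (Scrapy 1.4+) resolves relative hrefs, and cb_kwargs (Scrapy 1.7+) passes the title to the callback.
import scrapy


class QuokaDetailSketch(scrapy.Spider):
    # Hypothetical spider; only illustrates requesting the detail pages.
    name = 'quoka_detail_sketch'
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen']

    def parse(self, response):
        # Same row selection as in the answer above
        for row in response.xpath('//div[@id="ResultListData"]/ul/li'):
            details = row.xpath('.//div[@class="q-col n2"]/a')
            if details:
                title = details.xpath('./@title').get()
                href = details.xpath('./@href').get()
                # response.follow resolves the relative href against the current page
                yield response.follow(href, callback=self.parse_detail,
                                      cb_kwargs={'title': title})

    def parse_detail(self, response, title):
        # Sketch: yield a plain dict; the real project would populate AnbieterItem
        yield {
            'title': title,
            'Beschreibung': response.xpath('//div[@class="text"]/text()').getall(),
            'url': response.url,
        }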
