I am new to Scrapy. I have a base spider, similar to the example below:
import scrapy
from scrapy import Request, Selector
from scrapy.exceptions import CloseSpider

class MySpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['example.com']  # the domain where the spider is allowed to crawl
    start_urls = ['http://www.example.com/content/']  # url from which the spider will start crawling
    page_incr = 1
    flag = 0

    def parse(self, response):
        sel = Selector(response)
        stuffs = sel.xpath('//a/@href')
        for stuff in stuffs:
            link = stuff.extract()
            req1 = Request(url=link, callback=self.parse_item)
            yield req1

        url = 'http://www.example.com/content/?q=ajax//date/%d&page=%d' % (self.page_incr, self.page_incr)
        req2 = Request(url=url,
                       headers={"Referer": "http://www.example.com/content", "X-Requested-With": "XMLHttpRequest"},
                       callback=self.parse_xhr)
        yield req2

    def parse_xhr(self, response):
        sel = Selector(response)
        stuffs = sel.xpath('//a/@href')
        for stuff in stuffs:
            link = stuff.extract()
            yield Request(url=link, callback=self.parse_item)

        content = sel.xpath('//a/@href').extract()
        if content == []:
            self.flag += 1
            if self.flag == 5:
                raise CloseSpider('WARNING: <Spider forced to stop>')
        else:
            self.flag = 0

        self.page_incr += 1
        url = 'http://www.example.com/content/?q=ajax//date/%d&page=%d' % (self.page_incr, self.page_incr)
        req3 = Request(url=url,
                       headers={"Referer": "http://www.example.com/content", "X-Requested-With": "XMLHttpRequest"},
                       callback=self.parse_xhr)
        yield req3

     def parse_item(self, response):
        pass
When I try to run the crawl I get this error:
line 24, in parse
    req1 = Request(url=link, callback=self.parse_item)
exceptions.AttributeError: 'MySpider' object has no attribute 'parse_item'
I don't get it... Please help me see what is wrong!
Thanks for your time and help.
Your parse_item() method is incorrectly indented (with 5 spaces instead of 4).
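In other words, parse_item() needs to be at the same 4-space indentation level as parse() and parse_xhr() so that Python treats it as a method of MySpider. A minimal sketch of the corrected layout (method bodies elided):

class MySpider(scrapy.Spider):
    name = 'myspider'

    def parse(self, response):
        ...

    def parse_xhr(self, response):
        ...

    def parse_item(self, response):  # 4 spaces, aligned with the other methods
        pass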
I'm new to Scrapy and Python. I can download all the files, but I want to download only a specific type of file, "EX-10", so that it downloads the following files (EX-10.1, EX-10.2 up to EX-10.99).
My code:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url=targetLink, callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                baseLink = response.urljoin(links)
                yield scrapy.Request(url=baseLink, callback=self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        if not os.path.exists(dirf):
            os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
Scrapy should also follow the next pages (up to the last page), but it's not working fine. I tried:
Rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@value="Next 40"]',)), callback="parse", follow=True),)

# follow next page links
next_page = response.xpath('.//a[@value="Next 40"]/@href').extract()
if next_page:
    next_href = next_page[0]
    next_page_url = 'https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany' + next_href
    request = scrapy.Request(url=next_page_url)
    yield request
The following script should fetch the required files from that site, following every pagination link and downloading the files the way you wanted:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_links)

        nextpage = response.css("input[value='Next 40']::attr(onclick)")
        if nextpage:
            tpage = nextpage.extract_first().split("parent.location=")[1].replace("'", "")
            nlink = response.urljoin(tpage)
            yield scrapy.Request(url=nlink, callback=self.parse)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url=targetLink, callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//tr[td[starts-with(., "EX-")]]/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href').extract():
            baseLink = response.urljoin(links)
            yield scrapy.Request(url=baseLink, callback=self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        if not os.path.exists(dirf):
            os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
You need to use a FilesPipeline, but the one that scrapy provides generates the file name based on the hash of the URL.
If you want a custom file name, you have to make your own FilesPipeline like this:
import scrapy, os
from scrapy.pipelines.files import FilesPipeline

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    custom_settings = {
        'ITEM_PIPELINES': {'myspider.MyFilesPipeline': 1},
        'FILES_STORE': '/my/valid/path/',
    }

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url=targetLink, callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                yield {
                    'file_urls': [response.urljoin(links)]
                }

class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        return request.url.split('/')[-1]
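If you would rather not use custom_settings, the same pipeline can be enabled project-wide instead; here is a minimal settings.py sketch, assuming MyFilesPipeline is importable as myspider.MyFilesPipeline (adjust the dotted path and the storage directory to your project):

# settings.py
ITEM_PIPELINES = {
    'myspider.MyFilesPipeline': 1,
}
FILES_STORE = '/my/valid/path/'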
I'm trying to pass a value from one function to another. I looked up the docs and just didn't understand them.
Reference:
def parse_page1(self, response):
    item = MyItem()
    item['main_url'] = response.url
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    yield item
Here is pseudo code of what I want to achieve:
import scrapy

class GotoSpider(scrapy.Spider):
    name = 'goto'
    allowed_domains = ['first.com', 'second.com']
    start_urls = ['http://first.com/']

    def parse(self, response):
        name = response.xpath(...)
        price = scrapy.Request(second.com, callback=self.parse_check)
        yield(name, price)

    def parse_check(self, response):
        price = response.xpath(...)
        return price
This is how you can pass any value, link, etc. to other methods:
import scrapy

class GotoSpider(scrapy.Spider):
    name = 'goto'
    allowed_domains = ['first.com', 'second.com']
    start_urls = ['http://first.com/']

    def parse(self, response):
        name = response.xpath(...)
        link = response.xpath(...)  # link for second.com where you may find the price

        request = scrapy.Request(url=link, callback=self.parse_check)
        request.meta['name'] = name
        yield request

    def parse_check(self, response):
        name = response.meta['name']
        price = response.xpath(...)
        yield {"name": name, "price": price}  # assuming that in your items.py the fields are declared as name, price
This is my code. It seems to be correct, but it doesn't work. Please help.
HEADER_XPATH = ['//h1[@class="story-body__h1"]//text()']
AUTHOR_XPATH = ['//span[@class="byline__name"]//text()']
PUBDATE_XPATH = ['//div/@data-datetime']
WTAGS_XPATH = ['']
CATEGORY_XPATH = ['//span[@rev="news|source"]//text()']
TEXT = ['//div[@property="articleBody"]//p//text()']
INTERLINKS = ['//div[@class="story-body__link"]//p//a/@href']
DATE_FORMAT_STRING = '%Y-%m-%d'

class BBCSpider(Spider):
    name = "bbc"
    allowed_domains = ["bbc.com"]
    sitemap_urls = [
        'http://Www.bbc.com/news/sitemap/',
        'http://www.bbc.com/news/technology/',
        'http://www.bbc.com/news/science_and_environment/']

    def parse_page(self, response):
        items = []
        item = ContentItems()
        item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
        item['resource'] = urlparse(response.url).hostname
        item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
        item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
        item['tags'] = process_array_item(self, response, TAGS_XPATH, single=False)
        item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
        item['article_text'] = process_article_text(self, response, TEXT)
        item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
        item['link'] = response.url
        items.append(item)
        return items
Your spider is just badly structured and, because of that, it does nothing.
The scrapy.Spider class requires a start_urls class attribute containing the list of URLs the spider will use to start the crawl; all of these URLs are passed to the parse class method, which is therefore required as well.
Your spider has a sitemap_urls class attribute that is not used anywhere, and it also has a parse_page class method that is never called.
So, in short, your spider should look something like this:
class BBCSpider(Spider):
    name = "bbc"
    allowed_domains = ["bbc.com"]
    start_urls = [
        'http://Www.bbc.com/news/sitemap/',
        'http://www.bbc.com/news/technology/',
        'http://www.bbc.com/news/science_and_environment/']

    def parse(self, response):
        # This is a page with all of the articles
        article_urls = # find article urls in the pages
        for url in article_urls:
            yield Request(url, self.parse_page)

    def parse_page(self, response):
        # This is an article page
        items = []
        item = ContentItems()
        # populate item
        return item
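For example, if the listing pages expose their articles as plain links, parse() could be filled in along these lines; the XPath below is only a hypothetical placeholder and has to be adapted to the actual BBC markup:

    def parse(self, response):
        # Hypothetical selector -- verify against the real page structure
        article_urls = response.xpath('//a[contains(@href, "/news/")]/@href').extract()
        for url in article_urls:
            yield Request(url=response.urljoin(url), callback=self.parse_page)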
In the parse() method the spider crawls 4 URLs and then sends them to parse_dir_contents() to scrape some data, but only the 4th URL is being scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json

class linkedin(scrapy.Spider):
    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            print "________________" + url
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print "____________" + url
            item = VOneItem()
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item
By inspecting the pages, I think there is no need for the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
And check if this solves your issue.
Every time I run my code my IP gets banned. I need help delaying each request by 10 seconds. I've tried to place DOWNLOAD_DELAY in the code, but it gives no results. Any help is appreciated.
import re
import scrapy

# item class included here
class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "https://washingtondc.craigslist.org/search/fua"
    ]

    BASE_URL = 'https://washingtondc.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/nos/vgm/" + item_id
            item = DmozItem()
            item["link"] = response.url
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
You need to set DOWNLOAD_DELAY in the settings.py of your project. Note that you may also need to limit concurrency. By default concurrency is 8, so you are hitting the website with 8 simultaneous requests.
# settings.py
DOWNLOAD_DELAY = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 2
Starting with Scrapy 1.0 you can also place custom settings in the spider, so you could do something like this:
class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    custom_settings = {
        "DOWNLOAD_DELAY": 5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 2
    }
Delay and concurrency are set per downloader slot, not per request. To actually check what delay and concurrency you have, you could try something like this:
def parse(self, response):
    delay = self.crawler.engine.downloader.slots["www.dmoz.org"].delay
    concurrency = self.crawler.engine.downloader.slots["www.dmoz.org"].concurrency
    self.log("Delay {}, concurrency {} for request {}".format(delay, concurrency, response.request))
    return