I am trying to yield items from different requests as shown here. If I add items = PrintersItem() to each request I get endless loops. If I take it out, other errors occur. I'm not sure how to combine yield request with yield items for each request.
import scrapy
from scrapy.http import Request, FormRequest
from ..items import PrintersItem
from scrapy.utils.response import open_in_browser

class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        items = PrintersItem()
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken' : token,
            'B55d' : 'password',
            'loginurl' : '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        items = PrintersItem()
        contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            url = response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action)
        for items in self.postlogin2(response):
            yield items

    def action(self, response):
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            url = response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2)
        for items in self.action(response):
            yield items

    def action2(self, response):
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        items['tcpip'] = tcpip
        for items in self.action2(response):
            yield items
If you want to send items from parse to postlogin2, etc., then pass them as meta data in the Request
yield Request( ..., meta={"items": items})
and get them back in the next callback
items = response.meta["items"]
and yield them only in the last function
yield items
Doc: Request and Response, Request.meta special keys
class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35',
                  'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken' : token,
            'B55d' : 'password',
            'loginurl' : '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        items = PrintersItem()
        contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            #url=response.urljoin("/general/information.html?kind=item"),
            url=response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action,
            meta={"items": items})

    def action(self, response):
        items = response.meta["items"]
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            #url=response.urljoin("/net/wired/tcpip.html"),
            url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2,
            meta={"items": items})

    def action2(self, response):
        items = response.meta["items"]
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        items['tcpip'] = tcpip
        yield items
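As a side note, newer Scrapy versions (1.7+) can also pass the item to the callback as a keyword argument via cb_kwargs instead of meta. A minimal sketch of the same hand-off, assuming the same PrintersItem fields (the '...' stands for the long XPaths shown above):

def postlogin2(self, response):
    items = PrintersItem()
    items['contact'] = response.xpath('...').extract()   # same contact XPath as above
    items['location'] = response.xpath('...').extract()  # same location XPath as above
    yield Request(
        url=response.url.split('/general')[0] + "/general/information.html?kind=item",
        callback=self.action,
        cb_kwargs={"items": items})

def action(self, response, items):
    items['drum'] = response.xpath('...').extract()  # same drum XPath as above
    yield Request(
        url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
        callback=self.action2,
        cb_kwargs={"items": items})

def action2(self, response, items):
    items['tcpip'] = response.xpath('...').extract()  # same tcpip XPath as above
    yield items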
I want to scrape the details present in json form using scrapy. There are multiple start_urls and each start_url has multiple pages to scrape. I am just not able to work out the logic of how to do it.
import scrapy
from scrapy.http import Request

BASE_URL = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
            ]

class ChangeSpider(scrapy.Spider):
    name = 'change'

    def start_requests(self):
        for i in range(len(BASE_URL)):
            yield Request(BASE_URL[i], callback = self.parse)

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
Try like this:
import scrapy
from scrapy.http import Request

class ChangeSpider(scrapy.Spider):
    name = 'change'
    start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
                  ]

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
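Note that pageNumber here is shared by all the tags and next_page is hard-coded to the animals-19 URL, so only that tag really advances. A hedged sketch of parse() that paginates each tag independently by bumping the offset found in the current response URL (this assumes the {} placeholder in start_urls is replaced with a real starting offset such as 0, and that the API pages by offset/limit):

from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

def parse(self, response):
    data = response.json()
    for entry in data['items']:
        yield {"petition_id": entry['petition']['id']}
    if not data.get('last_page'):
        # rebuild the same URL with the offset advanced by the page size
        parts = urlparse(response.url)
        query = parse_qs(parts.query)
        limit = int(query.get('limit', ['8'])[0])
        offset = int(query.get('offset', ['0'])[0]) + limit
        query['offset'] = [str(offset)]
        next_page = urlunparse(parts._replace(query=urlencode(query, doseq=True)))
        yield response.follow(next_page, callback=self.parse)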
I have urls with multiple pages. I try to paginate to extract data from these urls, but it only works once (just one next_page). What's wrong?
import json
import scrapy
import re
import pkgutil
from scrapy.loader import ItemLoader
from rzc_spider.items import AnnonceItem

class AnnonceSpider(scrapy.Spider):
    name = 'rzc_results'

    def __init__(self, *args, **kwargs):
        data_file = pkgutil.get_data("rzc_spider", "json/input/test_tt.json")
        self.data = json.loads(data_file)

    def start_requests(self):
        for item in self.data:
            request = scrapy.Request(item['rzc_url'], callback=self.parse)
            request.meta['item'] = item
            yield request

    def parse(self, response):
        item = response.meta['item']
        item['results'] = []
        item["car_number"] = response.css(
            "h2.sub::text").extract_first()
        for caritem in response.css("div.ad > div[itemtype='https://schema.org/Vehicle']"):
            data = AnnonceItem()
            #model
            data["model"] = caritem.css(
                "em.title::text").extract_first()
            item['results'].append(data)
        yield item

        next_page = response.css(
            'a.link::attr(href)').extract_first()
        if next_page is not None:
            url_pagination = 'https://www.websiteexample.com' + next_page
            meta = {'item': response.meta['item']}
            yield scrapy.Request(url=url_pagination, callback=self.parse, meta=meta)

    #ban proxies reaction
    def response_is_ban(self, request, response):
        return b'banned' in response.body

    def exception_is_ban(self, request, exception):
        return None
The json file with the url (a sample in this case):
[{
"rzc_url": "https://www.websiteexample.com/model"
}]
Try and check the URL. Sometimes sites set traps so that only one next_page link has an absolute URL while another one is relative. Instead of concatenating url_pagination with next_page, use urljoin. Import it first:
from urllib.parse import urljoin

yield scrapy.Request(urljoin(response.url, next_page), callback=self.parse, meta=meta)
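Alternatively, Scrapy responses already provide a urljoin helper, so the pagination branch from the question can be written without the extra import; a minimal sketch reusing the question's selector:

next_page = response.css('a.link::attr(href)').extract_first()
if next_page is not None:
    # response.urljoin resolves both relative and absolute hrefs against the current page
    yield scrapy.Request(response.urljoin(next_page),
                         callback=self.parse,
                         meta={'item': response.meta['item']})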
I can get the loop to work great, except I am getting 3-4 loops each time... I tried removing the start_urls reference, but then the scraping stops working.
import scrapy
from scrapy.http import Request, FormRequest
from scrapy.utils.response import open_in_browser

class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken' : token,
            'B55d' : 'password',
            'loginurl' : '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        for i in self.start_urls:
            yield Request(
                url = i+"/general/information.html?kind=item",
                callback=self.action)

    def action(self, response):
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        print(drum)
        for i in self.start_urls:
            yield Request(
                url = i+"/net/wired/tcpip.html",
                callback=self.action2)

    def action2(self, response):
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        print(tcpip)
Scrapy uses the elements from start_urls to run parse() - in later callbacks you should build the URL from the response instead of looping over start_urls again.
def postlogin2(self, response):
    yield Request(
        response.url + "/general/information.html?kind=item",
        callback=self.action)
or rather
def postlogin2(self, response):
    yield Request(
        response.urljoin("/general/information.html?kind=item"),
        callback=self.action)
or
def postlogin2(self, response):
    yield response.follow("/general/information.html?kind=item", callback=self.action)
Do the same with other loops.
Doc: Response.urljoin(), Response.follow()
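For example, doing the same with the other loops could look like this (a sketch only; the '...' stands for the question's long XPaths):

def action(self, response):
    drum = response.xpath('...').extract()  # same drum XPath as in the question
    print(drum)
    yield response.follow("/net/wired/tcpip.html", callback=self.action2)

def action2(self, response):
    tcpip = response.xpath('...').extract()  # same tcpip XPath as in the question
    print(tcpip)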
I'm new to scrapy and python. I can download all the files, but I want to download only files of a specific type, "EX-10", so that it downloads the following files (EX-10.1, EX-10.2 up to EX-10.99).
My Code
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"
    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                baseLink = response.urljoin(links)
                yield scrapy.Request(url = baseLink, callback = self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        if not os.path.exists(dirf): os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
I also want Scrapy to check for the next pages (up to the last page), but it's not working.
Rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@value="Next 40"]',)), callback="parse", follow= True),)

# follow next page links
next_page = response.xpath('.//a[@value="Next 40"]/@href').extract()
if next_page:
    next_href = next_page[0]
    next_page_url = 'https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany' + next_href
    request = scrapy.Request(url=next_page_url)
    yield request
Your problem seems to have been solved. The following script should fetch the required files from that site, following every pagination link and downloading the files the way you wanted.
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"
    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

        nextpage = response.css("input[value='Next 40']::attr(onclick)")
        if nextpage:
            tpage = nextpage.extract_first().split("parent.location=")[1].replace("'","")
            nlink = response.urljoin(tpage)
            yield scrapy.Request(url=nlink, callback = self.parse)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//tr[td[starts-with(., "EX-")]]/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href').extract():
            baseLink = response.urljoin(links)
            yield scrapy.Request(url = baseLink, callback = self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        if not os.path.exists(dirf): os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
You need to use a FilesPipeline, but the one that scrapy provides generates the file name based on the hash of the URL.
If you want a custom file name, you have to make your own FilesPipeline like this:
import scrapy, os
from scrapy.pipelines.files import FilesPipeline

class legco(scrapy.Spider):
    name = "sec_gov"
    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    custom_settings = {
        'ITEM_PIPELINES': {'myspider.MyFilesPipeline': 1},
        'FILES_STORE': '/my/valid/path/',
    }

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                yield {
                    'file_urls': [response.urljoin(links)]
                }

class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        return request.url.split('/')[-1]
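If you also want the pipeline version to keep only the EX-10 documents the question asks for, collecting_file_links could reuse the row filter from the first answer before yielding file_urls; a hedged sketch (XPath adapted from that answer and narrowed to EX-10, not re-tested here):

def collecting_file_links(self, response):
    links = response.xpath('//table[contains(@summary,"Document")]'
                           '//tr[td[starts-with(., "EX-10")]]'
                           '/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href').extract()
    for link in links:
        yield {'file_urls': [response.urljoin(link)]}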
I need to apply FormRequest [From here][1]:
#Request = FormRequest.from_response(
#    response,
#    formname='frmSearch',
#    formdata={'classtype': 'of'},
#    #callback=self.parse_links,
#    dont_filter=True,
#
#    )
for the link in start_urls and for all pages that I get from the rules in my CrawlSpider.
class QuokaSpider(CrawlSpider):
    name = 'quoka'
    allowed_domains = ['www.quoka.de']
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen/']
    curr_page = 0

    rules = (Rule(LinkExtractor(allow=(r'.+'), restrict_xpaths = [u'//li[@class="arr-rgt active"]',]),
                  follow=True, callback='parse_links'),
             )

    def _url(self, url):
        return 'http://www.quoka.de' + url

    def parse_links(self, response):
        hxs = Selector(response)
        lnks = hxs.xpath('//a[contains(@class, "img-lmtr") and contains(@class, "multi") or contains(@class, "single")]/@href').extract()
        filters = hxs.xpath(u'//div[@class="modal-title"]/text()').extract()
        for fil in filters:
            print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"+fil+"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        for url in lnks:
            request = Request(self._url(url), callback=self.parse_object)
            yield request

    def parse_object(self, response):
        item = AnbieterItem()
        hxs = Selector(response)
        item['Beschreibung'] = hxs.xpath(u'//div[@class="text"]/text()').extract()
        # item['Kleinanzeigen_App'] = '1'
        # item['Preis'] = '1'
        return item
If I try to use "start_requests" for the filter, the spider does not use the pages from the rules.
How can I solve this problem and apply this filter to the start url and to the urls from the rules?
I don't know how to combine CrawlSpider rules with FormRequest, but I'd like to suggest that you replace the CrawlSpider with a generic Spider and create the requests manually.
The Rule in your code only takes care of following the pagination (as far as I can see). To replace that, you could use something like the following code sample:
import scrapy

class TestSpider(scrapy.Spider):
    name = 'quoka'
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen']

    def parse(self, response):
        request = scrapy.FormRequest.from_response(
            response,
            formname='frmSearch',
            formdata={'classtype': 'of'},
            callback=self.parse_filtered
        )
        print request.body
        yield request

    def parse_filtered(self, response):
        resultList = response.xpath('//div[@id="ResultListData"]/ul/li')
        for resultRow in resultList:
            xpath_Result_Details = './/div[@class="q-col n2"]/a'
            # Check if row has details
            if resultRow.xpath(xpath_Result_Details):
                result_Details = resultRow.xpath(xpath_Result_Details)
                # If YES extract details
                title = result_Details.xpath('./@title').extract()
                href = result_Details.xpath('./@href').extract()[0]
                # Code to request detail pages goes here ...
                print title, href

        # Use this instead of CrawlSpider to follow the pagination links
        xpath_NextPage = '//div[@class="rslt-pagination"]//li[@class="arr-rgt active"]/a'
        if response.xpath(xpath_NextPage):
            nextPage_href = response.xpath(xpath_NextPage + '/@href').extract()[0]
            nextPage_url = 'http://www.quoka.de/immobilien/bueros-gewerbeflaechen' + nextPage_href
            nextPage_num = response.xpath(xpath_NextPage + '/@data-qng-page').extract()[0]

            # request = scrapy.Request(nextPage_url, callback=self.parse_filtered)
            # Create request with formdata ...
            request = scrapy.FormRequest.from_response(
                response,
                formname='frmNaviSearch',
                formdata={'pageno': nextPage_num},
                callback=self.parse_filtered
            )
            yield request
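Where the comment says "Code to request detail pages goes here", one option is to follow each detail link and reuse the parse_object idea from the question; a hedged sketch (AnbieterItem and the Beschreibung XPath come from the question, and the href is assumed to resolve correctly with response.urljoin):

def parse_filtered(self, response):
    for result in response.xpath('//div[@id="ResultListData"]/ul/li//div[@class="q-col n2"]/a'):
        href = result.xpath('./@href').extract_first()
        if href:
            yield scrapy.Request(response.urljoin(href), callback=self.parse_object)
    # ... pagination FormRequest as above ...

def parse_object(self, response):
    item = AnbieterItem()
    item['Beschreibung'] = response.xpath('//div[@class="text"]/text()').extract()
    yield item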