Scrapy Multiple loops for Multiple URLs - python

I can get the loop to work great, except that I am getting 3-4 loops each time. I tried removing the start_urls reference, but then the scraping stops working.
import scrapy
from scrapy.http import Request, FormRequest
from scrapy.utils.response import open_in_browser
class PrinterSpider(scrapy.Spider):
    """Submit the login form on each start URL, then print two values per device."""

    name = 'printers'
    start_urls = [
        'http://192.168.137.9',
        'http://192.168.137.35',
        'http://192.168.137.34',
        'http://192.168.137.27',
        'http://192.168.137.21',
    ]

    def parse(self, response):
        """Post the login form, echoing back the CSRF token found in the page."""
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken' : token,
            'B55d' : 'password',
            'loginurl' : '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self,response):
        """Request the information page once the login response arrives."""
        # NOTE: this callback is invoked for every login response, yet it
        # builds requests for ALL start URLs each time - this is the
        # repetition described in the question.
        for i in self.start_urls:
            yield Request(
                url = i+"/general/information.html?kind=item",
                callback=self.action)

    def action(self,response):
        """Print the drum value, then request the TCP/IP settings page."""
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        print(drum)
        # NOTE: same pattern as postlogin2 - loops over every start URL again.
        for i in self.start_urls:
            yield Request(
                url = i+"/net/wired/tcpip.html",
                callback=self.action2)

    def action2(self, response):
        """Print the value of the input selected on the TCP/IP page."""
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        print(tcpip)

Scrapy calls parse() once for every element of start_urls, so in the later callbacks you should build the next URL from the response instead of looping over start_urls again.
def postlogin2(self, response):
    """Request the information page by appending its path to the response URL."""
    info_url = response.url + "/general/information.html?kind=item"
    yield Request(info_url, callback=self.action)
or rather
def postlogin2(self, response):
    """Request the information page, resolving its path with response.urljoin()."""
    info_url = response.urljoin("/general/information.html?kind=item")
    yield Request(info_url, callback=self.action)
or
def postlogin2(self, response):
    """Request the information page via response.follow()."""
    next_request = response.follow(
        "/general/information.html?kind=item",
        callback=self.action,
    )
    yield next_request
Do the same with other loops.
Doc: Response.urljoin(), Response.follow()

Related

why my start_request function is not calling my parse function in my scrapy program?

I am trying the scrape the review of this particular IMDB title. But for some reason the start_request is not calling the parse function for this title alone. for another title, it seems to work.
Code examples:
class imdb(scrapy.Spider):
    """Spider that issues a single SeleniumRequest for one reviews page."""

    name = 'imdb'

    def start_requests(self):
        """Yield the SeleniumRequest whose response is handed to parse()."""
        reviews_url = "https://www.imdb.com/title/tt8217188/reviews"
        request = SeleniumRequest(
            url=reviews_url,
            wait_time=4,
            screenshot=True,
            callback=self.parse,
        )
        yield request

    def parse(self, response):
        """Print a marker showing that the callback was reached."""
        print("done")
Looks like you didn't read the docs:
https://github.com/clemfromspace/scrapy-selenium
Anyways, try this code
from scrapy_selenium import SeleniumRequest
class MyBotSpider(scrapy.Spider):
    """Spider that carries its own scrapy-selenium settings."""

    name = 'mybot'

    # Per-spider settings: the Chrome driver executable and the Selenium
    # downloader middleware are configured here rather than in settings.py.
    custom_settings = {
        'SELENIUM_DRIVER_TYPE': 'executable',
        'SELENIUM_DRIVER_NAME': 'chrome',
        'SELENIUM_DRIVER_EXECUTABLE_PATH': r'c:\chromedriver.exe',
        'SELENIUM_DRIVER_ARGUMENTS': [],
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_selenium.SeleniumMiddleware': 800,
        },
    }

    def start_requests(self):
        """Yield one SeleniumRequest for the reviews page."""
        request = SeleniumRequest(
            url="https://www.imdb.com/title/tt8217188/reviews",
            wait_time=4,
            screenshot=True,
            callback=self.parse,
        )
        yield request

    def parse(self, response):
        """Log a marker showing that the callback was reached."""
        self.logger.info("Here")

How could I request urls just changes last part of url

In spider, I just want to request URLs that have one rule.
URLs list :
www.example.com/bread/coffee/A
www.example.com/bread/coffee/B
www.example.com/bread/coffee/C
www.example.com/bread/coffee/D
so start_request is www.example.com/bread/coffee/A
and then what gonna have to do in def parse ??
class MySpider(scrapy.Spider):
name = 'exmple.com'
start_urls = [www.example.com/bread/coffee/A]
def parse(self, response):
???
yield ???
a little hint will appreciate
you can use code like this:
class MySpider(scrapy.Spider):
    """Request a fixed set of URLs that differ only in their last path segment."""

    name = 'exmple.com'
    # Request URLs must carry a scheme (http:// or https://); Scrapy rejects
    # scheme-less URLs such as 'www.example.com/...'.
    start_urls = ['http://www.example.com/bread/coffee/A']

    def start_requests(self):
        """Yield one request per URL; each response is handled by parse()."""
        urls = [
            'http://www.example.com/bread/coffee/A',
            'http://www.example.com/bread/coffee/B',
            'http://www.example.com/bread/coffee/C',
            'http://www.example.com/bread/coffee/D',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Placeholder callback: extract data from the response and yield it."""
        # find what you need
        yield  # your item/dict
Also, make sure your URLs have the correct format: they must start with http:// or https://.
You can also use the string module to generate your URLs:
import string
def start_requests(self):
    """Yield one request per uppercase ASCII letter appended to the base URL."""
    # Keep the template in its own name: rebinding it to the formatted
    # result would drop the '{}' placeholder after the first iteration, so
    # every later request would target the '.../A' URL again.
    template = 'http://www.example.com/bread/coffee/{}'
    for letter in string.ascii_uppercase:
        yield scrapy.Request(url=template.format(letter), callback=self.parse)

Scrapy yield items from multiple requests

I am trying to yield items from different requests as shown here. If I add items = PrintersItem() to each request I get endless loops. If I take it out, other errors occur. I am not sure how to combine yielding requests with yielding items for each printer.
import scrapy
from scrapy.http import Request, FormRequest
from ..items import PrintersItem
from scrapy.utils.response import open_in_browser
class PrinterSpider(scrapy.Spider):
    """Log in on each start URL and try to fill one PrintersItem from several pages."""

    name = 'printers'
    start_urls = [
        'http://192.168.137.9',
        'http://192.168.137.35',
        'http://192.168.137.34',
        'http://192.168.137.27',
        'http://192.168.137.21',
    ]

    def parse(self, response):
        """Post the login form, echoing back the CSRF token found in the page."""
        items = PrintersItem()
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken' : token,
            'B55d' : 'password',
            'loginurl' : '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self,response):
        """Read contact/location, then request the information page."""
        items = PrintersItem()
        contact = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            url = response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action)
        # NOTE: this callback re-invokes itself on the same response, so the
        # generator never finishes - the "endless loops" from the question.
        for items in self.postlogin2(response):
            yield items

    def action(self,response):
        """Read drum/printermodel, then request the TCP/IP settings page."""
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        # NOTE: `items` is never created or received in this callback; the
        # item built in postlogin2 is not passed along with the request.
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            url = response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2)
        # NOTE: self-recursive, same as in postlogin2.
        for items in self.action(response):
            yield items

    def action2(self, response):
        """Read the TCP/IP input value."""
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        # NOTE: `items` is not defined here either, and the loop below is
        # self-recursive as well.
        items['tcpip'] = tcpip
        for items in self.action2(response):
            yield items
If you want to send items from parse to postlogin2, etc. then add it as meta data in Request
yield Request( ..., meta={"items": items})
and get it in other function
items = response.meta["items"]
and yield it only in the last function
yield items
Doc: Request and Response, Request.meta special keys
class PrinterSpider(scrapy.Spider):
    """Log in on each start URL and build one PrintersItem across three pages.

    The partially filled item travels from callback to callback in
    ``Request.meta`` and is yielded only by the last callback.
    """

    name = 'printers'
    start_urls = [
        'http://192.168.137.9',
        'http://192.168.137.35',
        'http://192.168.137.34',
        'http://192.168.137.27',
        'http://192.168.137.21',
    ]

    # Absolute path to the form that every scraped page shares; each field's
    # XPath is this prefix plus a short page-specific suffix.
    _FORM = '//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]'

    def parse(self, response):
        """Post the login form, echoing back the CSRF token found in the page."""
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        print(token)
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken' : token,
            'B55d' : 'password',
            'loginurl' : '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        """Create the item, store contact/location, and request the next page."""
        items = PrintersItem()
        contact = response.xpath(self._FORM + '/div[5]/dl[1]/dd[1]/ul[1]/li[1]/text()[last()]').extract()
        location = response.xpath(self._FORM + '/div[5]/dl[1]/dd[1]/ul[1]/li[2]/text()[last()]').extract()
        items['contact'] = contact
        items['location'] = location
        yield Request(
            # Alternative: url=response.urljoin("/general/information.html?kind=item"),
            url=response.url.split('/general')[0] + "/general/information.html?kind=item",
            callback=self.action,
            meta={"items": items})

    def action(self, response):
        """Add drum/printermodel to the carried item and request the TCP/IP page."""
        items = response.meta["items"]
        drum = response.xpath(self._FORM + '/div[7]/dl[1]/dd[1]/text()').extract()
        items['drum'] = drum
        print(drum)
        printermodel = response.xpath(self._FORM + '/div[5]/dl[1]/dd[1]/text()').extract()
        items['printermodel'] = printermodel
        yield Request(
            # Alternative: url=response.urljoin("/net/wired/tcpip.html"),
            url=response.url.split('/general')[0] + "/net/wired/tcpip.html",
            callback=self.action2,
            meta={"items": items})

    def action2(self, response):
        """Add the TCP/IP value and yield the completed item."""
        items = response.meta["items"]
        tcpip = response.xpath(self._FORM + '/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        items['tcpip'] = tcpip
        yield items

scrapy parsing first page

I'm using scrapy .24.4, I'm trying to scrape some information from threatexpert and I've almost got it, I can grab all the information on all the pages EXCEPT the first page(or start_url). I've tried parse_start_url and adding Rules and just can't get it to work. I'm sure it's just something I've overlooked but I've been looking at it all weekend and just need a break. I'd appreciate if anyone has any suggestions etc. Oh I did get it to work with a range in the start_url but it looked kind of inelegant and I'm trying to learn the right way. Many thanks in advance!!
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(scrapy.Spider):
    """Follow pagination links, then report links, and yield one item per report."""

    name = 'threatexpert'
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]

    def parse(self, response):
        """Schedule every pagination link found on the start page."""
        # NOTE: only links whose href contains "page" are followed here; the
        # "md5" links on the start page itself are never extracted, which is
        # the missing-first-page behaviour described in the question.
        print('++++++++++++++++++++++++pull all page links+++++++++++++++++++++++')
        urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
        for url in urls:
            url = urlparse.urljoin(response.url, url)
            self.log('Found follow url: %s' % url)
            yield scrapy.Request(url, callback = self.parse_links)

    def parse_links(self, response):
        """Schedule every link on a listing page whose href contains "md5"."""
        print('++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++')
        urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
        for url in urls:
            url = urlparse.urljoin(response.url, url)
            self.log('Found follow url: %s' % url)
            yield scrapy.Request(url, callback = self.parse_items)

    def parse_items(self, response):
        """Extract the "File MD5" value from a report page into an item."""
        self.log("Hi, this is an item page! %s" % response.url)
        item = ThreatExpert()
        item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
        yield item
Many, many thanks for the response it led me to what I got to work! Just had the wrong class instead of class ThreatExpertSpider(scrapy.Spider), I used class ThreatExpertSpider(CrawlSpider):, I'm still not entirely sure how it works but it does. I know RTFM, lol, but I'm learning. Here is what worked for me in case anyone else is looking for this.
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(CrawlSpider):
    """CrawlSpider variant: a Rule follows pagination links automatically."""

    name = 'threatexpert'
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=3&sl=1"]

    # Every link matching "page=<digit>" is followed and its response is
    # passed to parse_links.
    rules = (
        Rule(SgmlLinkExtractor(allow=r'page=\d'), callback='parse_links', follow=True),
    )

    def parse_start_url(self, response):
        """Handle the response of the start URL itself."""
        print('++++++++++++++++++++++++parse_start_url+++++++++++++++++++++++')
        # NOTE(review): the start page is handed to parse_items, while
        # parse_links is the callback that collects the "md5" links from
        # listing pages - confirm which one is intended here.
        return self.parse_items(response)
        # urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
        # for url in urls:
        #     url = urlparse.urljoin(response.url, url)
        #     self.log('Found follow url: %s' % url)
        #     yield scrapy.Request(url, callback = self.parse_links)

    def parse_links(self, response):
        """Schedule every link on a listing page whose href contains "md5"."""
        print('++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++')
        urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
        for url in urls:
            url = urlparse.urljoin(response.url, url)
            self.log('Found follow url: %s' % url)
            yield scrapy.Request(url, callback = self.parse_items)

    def parse_items(self, response):
        """Extract the "File MD5" value from a report page into an item."""
        self.log("Hi, this is an item page! %s" % response.url)
        item = ThreatExpert()
        item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
        # item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
        # if item['callback']:
        #     item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
        # else:
        #     del item['callback']
        yield item
Please refer to the code below; this works for me. If you have any questions, please let me know in a comment.
from scrapy.spider import BaseSpider
from scrapy.http import Request
import re
from urlparse import urljoin
from scrapy.selector import HtmlXPathSelector
from threatexpert.items import ThreatExpert
import inspect
class usmallspider(BaseSpider):
    """Follow pagination links, then report links, and return one item per report."""

    name = 'threatexpert'
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]

    def parse(self, response):
        """Schedule every link on the start page whose href contains "page"."""
        urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
        for url in urls:
            url = urljoin(response.url, url)
            print(url)
            if url:
                yield Request(url, callback=self.parse_links)

    def parse_links(self, response):
        """Schedule every link on a listing page whose href contains "md5"."""
        urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
        for url in urls:
            url = urljoin(response.url, url)
            if url:
                yield Request(url, callback = self.parse_items)

    def parse_items(self, response):
        """Return a one-element list holding the item built from a report page."""
        # ThreatExpert is the item class this snippet imports; the previously
        # referenced MallUk1Item is not defined or imported anywhere in it.
        item = ThreatExpert()
        item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
        return [item]

Scrapy : crawling start_urls causing issues

If I define the home page in my start_urls, Scrapy doesn't crawl the page and the "if" check in the parse_item function is never hit (e.g. 'someurl.com/medical/patient-info'). But when I provide that same page URL as the start URL (i.e. start_urls = 'someurl.com/medical/patient-info'), it crawls it and hits the check below in parse_item.
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
from scrapy import log
# CrawlSpider that requests the login page first, submits the login form, and
# then records the links found on one specific page into DmozItem objects.
# NOTE(review): indentation was lost in this paste; the nesting of the
# statements inside parse_item cannot be fully recovered from the text alone.
class MySpider(CrawlSpider):
# Class-level lists (defined on the class, not per instance).
items = []
failed_urls = []
duplicate_responses = []
name = 'myspiders'
allowed_domains = ['someurl.com']
login_page = 'someurl.com/login_form'
# NOTE(review): this is a plain string rather than a list of URLs; it is also
# passed directly as `url=` in check_login_response below.
start_urls = 'someurl.com/' # Facing problem for the url here
# Follow every extracted link except those matching 'logged_out'/'logout',
# sending each response to parse_item.
rules = [Rule(SgmlLinkExtractor(deny=('logged_out', 'logout',)), follow=True, callback='parse_item')]
# Replace the default start requests with a single request for the login page.
def start_requests(self):
yield Request(
url=self.login_page,
callback=self.login,
dont_filter=False
)
def login(self, response):
"""Generate a login request."""
return FormRequest.from_response(response,
formnumber=1,
formdata={'username': 'username', 'password': 'password' },
callback=self.check_login_response)
def check_login_response(self, response):
"""Check the response returned by a login request to see if we are
successfully logged in.
"""
# The presence of the text "Logout" in the body is treated as a successful login.
if "Logout" in response.body:
self.log("Successfully logged in. Let's start crawling! :%s" % response, level=log.INFO)
self.log("Response Url : %s" % response.url, level=log.INFO)
# No callback is given for this request.
return Request(url=self.start_urls)
else:
# NOTE(review): every other self.log call passes `level=`; `loglevel=` here
# looks like a typo - confirm.
self.log("Bad times :(", loglevel=log.INFO)
# Callback named in `rules`: inspects one specific URL and classifies its links.
def parse_item(self, response):
# Scrape data from page
hxs = HtmlXPathSelector(response)
self.log('response came in from : %s' % (response), level=log.INFO)
# check for some important page to crawl
if response.url == 'someurl.com/medical/patient-info' :
self.log('yes I am here', level=log.INFO)
# NOTE(review): '#href' is presumably '@href' garbled by this paste - confirm.
urls = hxs.select('//a/#href').extract()
# De-duplicate the extracted hrefs (order is not preserved by set()).
urls = list(set(urls))
for url in urls :
self.log('URL extracted : %s' % url, level=log.INFO)
item = DmozItem()
if response.status == 404 or response.status == 500:
self.failed_urls.append(response.url)
self.log('failed_url : %s' % self.failed_urls, level=log.INFO)
item['failed_urls'] = self.failed_urls
else :
# NOTE(review): if the next two checks are nested, a URL starting with
# 'http' can never also start with 'someurl.com' - confirm the intent.
if url.startswith('http') :
if url.startswith('someurl.com'):
item['internal_link'] = url
self.log('internal_link :%s' % url, level=log.INFO)
else :
item['external_link'] = url
self.log('external_link :%s' % url, level=log.INFO)
self.items.append(item)
self.items = list(set(self.items))
return self.items
else :
self.log('did not recieved expected response', level=log.INFO)
I guess start_urls has to be a list.
Try the following: start_urls = ['http://www.someurl.com/', ]

Categories

Resources