scrapy parsing first page - python

I'm using scrapy .24.4, I'm trying to scrape some information from threatexpert and I've almost got it, I can grab all the information on all the pages EXCEPT the first page(or start_url). I've tried parse_start_url and adding Rules and just can't get it to work. I'm sure it's just something I've overlooked but I've been looking at it all weekend and just need a break. I'd appreciate if anyone has any suggestions etc. Oh I did get it to work with a range in the start_url but it looked kind of inelegant and I'm trying to learn the right way. Many thanks in advance!!
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(scrapy.Spider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]
def parse(self, response):
print '++++++++++++++++++++++++pull all page links+++++++++++++++++++++++'
urls = response.xpath('//a[contains(#href, "page")]/#href').extract()
for url in urls:
url = urlparse.urljoin(response.url, url)
self.log('Found follow url: %s' % url)
yield scrapy.Request(url, callback = self.parse_links)
def parse_links(self, response):
print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
urls = response.xpath('//a[contains(#href, "md5")]/#href').extract()
for url in urls:
url = urlparse.urljoin(response.url, url)
self.log('Found follow url: %s' % url)
yield scrapy.Request(url, callback = self.parse_items)
def parse_items(self, response):
self.log("Hi, this is an item page! %s" % response.url)
item = ThreatExpert()
item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
yield item

Many, many thanks for the response it led me to what I got to work! Just had the wrong class instead of class ThreatExpertSpider(scrapy.Spider), I used class ThreatExpertSpider(CrawlSpider):, I'm still not entirely sure how it works but it does. I know RTFM, lol, but I'm learning. Here is what worked for me in case anyone else is looking for this.
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(CrawlSpider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=3&sl=1"]
rules = (
Rule(SgmlLinkExtractor(allow=r'page=\d'), callback='parse_links', follow=True),
)
def parse_start_url(self, response):
print '++++++++++++++++++++++++parse_start_url+++++++++++++++++++++++'
return self.parse_items(response)
# urls = response.xpath('//a[contains(#href, "page")]/#href').extract()
# for url in urls:
# url = urlparse.urljoin(response.url, url)
# self.log('Found follow url: %s' % url)
# yield scrapy.Request(url, callback = self.parse_links)
def parse_links(self, response):
print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
urls = response.xpath('//a[contains(#href, "md5")]/#href').extract()
for url in urls:
url = urlparse.urljoin(response.url, url)
self.log('Found follow url: %s' % url)
yield scrapy.Request(url, callback = self.parse_items)
def parse_items(self, response):
self.log("Hi, this is an item page! %s" % response.url)
item = ThreatExpert()
item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
# item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
# if item['callback']:
# item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
# else:
# del item['callback']
yield item

Please refer the below code, This is works for me. If you have any queries please update through command.
from scrapy.spider import BaseSpider
from scrapy.http import Request
import re
from urlparse import urljoin
from scrapy.selector import HtmlXPathSelector
from threatexpert.items import ThreatExpert
import inspect
class usmallspider(BaseSpider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
urls = response.xpath('//a[contains(#href, "page")]/#href').extract()
for url in urls:
url = urljoin(response.url, url)
print url
if url:
yield Request(url, callback=self.parse_links)
def parse_links(self, response):
hxs = HtmlXPathSelector(response)
urls = response.xpath('//a[contains(#href, "md5")]/#href').extract()
for url in urls:
url = urljoin(response.url, url)
if url:
yield Request(url, callback = self.parse_items)
def parse_items(self, response):
itm=[]
item = MallUk1Item()
hxs = HtmlXPathSelector(response)
item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
itm.append(item)
return itm

Related

Scrapy Multiple loops for Multiple URLs

I can get the loop to work great except I am getting 3-4 loops each time...I tried removing the start_urls reference but then the scraping stops working
import scrapy
from scrapy.http import Request, FormRequest
from scrapy.utils.response import open_in_browser
class PrinterSpider(scrapy.Spider):
name = 'printers'
start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21' ]
def parse(self, response):
token = response.xpath('//*[#name="CSRFToken"]/#value').extract_first()
yield FormRequest.from_response(response, formnumber=1, formdata={
'CSRFToken' : token,
'B55d' : 'password',
'loginurl' : '/general/status.html'
}, callback=self.postlogin2)
def postlogin2(self,response):
for i in self.start_urls:
yield Request(
url = i+"/general/information.html?kind=item",
callback=self.action)
def action(self,response):
drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
print(drum)
for i in self.start_urls:
yield Request(
url = i+"/net/wired/tcpip.html",
callback=self.action2)
def action2(self, response):
tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/#value').extract()
print(tcpip)
Scrapy uses elements from start_urls to run parse() - later you should get url from response without loop.
def postlogin2(self, response):
yield Request(
response.url + "/general/information.html?kind=item",
callback=self.action)
or rather
def postlogin2(self, response):
yield Request(
response.urljoin("/general/information.html?kind=item"),
callback=self.action)
or
def postlogin2(self, response):
yield response.follow("/general/information.html?kind=item", callback=self.action)
Do the same with other loops.
Doc: Response.urljoin(), Response.follow()

Scrapy: Do not crawl links on other domains page

Below id my spider I created to get all links on NecToday.com for example.
import socket
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
class PropertiesItem(scrapy.Item):
# Primary fields
title = scrapy.Field()
url = scrapy.Field()
class NecSpider(CrawlSpider):
name = "NecSpider"
#allowed_domains = ["nectoday.com"]
start_urls = ["http://nectoday.com"]
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a',)), callback="parse_items", follow= True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
print(response.url)
item = PropertiesItem()
item["title"] = response.xpath("//title/text()").extract()
item["url"] = response.url
return(item)
This code starts to fetch all links present on site. Some of the pages have YouTube links as well. The problem is that once the first YouTube link is crawled, it starts to crawl other YouTube links referenced from the first YouTube link.
I want to crawl the first YouTube link, but no others. YouTube is just example. Tomorrow that can be another site as well. How can this be achieved?
Why not try something along the lines of this:
start_urls=["http://nectoday.com"]
def parse(self, response):
#parse whatever you need
for url in response.selector.xpath('//#href').extract():
if 'youtube.com' in url:
yield scrapy.Request(url, callback=self.parse_no_follow)
else:
yield scrapy.Request(url, callback=self.parse)
def parse_no_follow(self, response):
#parse whatever you want and not follow anymore links
This will only be scraping from your allowed domain.
class QuotesSpider(CrawlSpider):
name = "your app name"
n=0
allowed_domains = ['domain']
start_urls=['anywebpage']
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
def parse_item(self, response):
QuotesSpider.n=QuotesSpider.n+1
if (len(response.body)>100):
h = html2text.HTML2Text()
h.ignore_links = True
h.ignore_images = True
h.body_width = 0
dd = response.body.decode("utf-8")
init=dd.find("<p>")
while init>0:
end = dd.find("</p>", init)
if end>0:
o=h.handle(dd[init:end+4]+"\n")
supersentences=o.split('\n')

using scrapy extracting data inside links

I have been trying to extract data from consumercomplaints.in the title and the data inside those title links.I wrote the following code and unable to parse through the links and extract the data and also I am unable to extract all the links related.plz guide
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem
class criticspider(CrawlSpider):
name ="comp"
allowed_domains =["consumercomplaints.in"]
#start_urls =["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
start_urls=["http://www.consumercomplaints.in/?search=delhivery"]
rules=(
Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
#Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
)
def parse(self,response):
hxs = Selector(response)
sites = hxs.select('//table[#width="100%"]')
items = []
for site in sites:
item = CompItem()
item['title'] = site.select('.//td[#class="complaint"]/a/span/text()').extract()
item['link'] = site.select('.//td[#class="complaint"]/a/#href').extract()
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
# item['intro'] = site.select('.//td[#class="small"]//a[2]/text()').extract()
# item['heading'] = site.select('.//td[#class="compl-text"]/div/b[1]/text()').extract()
# item['date'] = site.select('.//td[#class="small"]/text()[2]').extract()
# item['complaint'] = site.select('.//td[#class="compl-text"]/div/text()').extract()
items.append(item)
def anchor_page(self, response):
hxs = Selector(response)
old_item = response.request.meta['item'] # Receiving parse Method item that was in Request meta
# parse some more values
#place them in old_item
#e.g
old_item['data']=hxs.select('.//td[#class="compl-text"]/div/text()').extract()
yield old_item
Are you using an old version of Scrapy?
In the latest stable version you don't need to do hxs = Selector(response) nor using the hxs.select() method. You can do the same thing just with response.xpath().
I think the problem in your code is that the result of select() (or response.xpath) is actually a Python list, so you need to do:
link = site.select('.//td[#class="complaint"]/a/#href').extract()
if link:
item['link'] = link[0]
You probably want to do a similar thing for title too.
EDIT: I got it working with a few changes:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
data = scrapy.Field()
class criticspider(CrawlSpider):
name = "comp"
allowed_domains = ["consumercomplaints.in"]
start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
rules = (
Rule(
SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
callback="parse",
follow=True),
)
def parse(self, response):
sites = response.xpath('//table[#width="100%"]')
items = []
for site in sites:
item = CompItem()
item['title'] = site.xpath('.//td[#class="complaint"]/a/span/text()').extract()[0]
item['link'] = site.xpath('.//td[#class="complaint"]/a/#href').extract()[0]
if item['link']:
if 'http://' not in item['link']:
item['link'] = urljoin(response.url, item['link'])
yield scrapy.Request(item['link'],
meta={'item': item},
callback=self.anchor_page)
items.append(item)
def anchor_page(self, response):
old_item = response.request.meta['item']
old_item['data'] = response.xpath('.//td[#class="compl-text"]/div/text()').extract()
yield old_item

Website scraping and screenshots

I am scrapping a website using scrapy and storing the internal/external links in my items class.
Is there a way that when the link is scrapped, I can capture the screenshot of it ?
Note : the website has a login authorisation form.
My Code (spider.py)
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
from scrapy import log
class MySpider(CrawlSpider):
items = []
failed_urls = []
duplicate_responses = []
name = 'myspiders'
allowed_domains = ['someurl.com']
login_page = 'someurl.com/login_form'
start_urls = 'someurl.com/'
rules = [Rule(SgmlLinkExtractor(deny=('logged_out', 'logout',)), follow=True, callback='parse_start_url')]
def start_requests(self):
yield Request(
url=self.login_page,
callback=self.login,
dont_filter=False
)
def login(self, response):
"""Generate a login request."""
return FormRequest.from_response(response,
formnumber=1,
formdata={'username': 'username', 'password': 'password' },
callback=self.check_login_response)
def check_login_response(self, response):
"""Check the response returned by a login request to see if we are
successfully logged in.
"""
if "Logout" in response.body:
self.log("Successfully logged in. Let's start crawling! :%s" % response, level=log.INFO)
self.log("Response Url : %s" % response.url, level=log.INFO)
yield Request(url=self.start_urls)
else:
self.log("Bad times :(", loglevel=log.INFO)
def parse_start_url(self, response):
# Scrape data from page
hxs = HtmlXPathSelector(response)
self.log('response came in from : %s' % (response), level=log.INFO)
# check for some important page to crawl
if response.url == 'someurl.com/medical/patient-info' :
self.log('yes I am here', level=log.INFO)
urls = hxs.select('//a/#href').extract()
urls = list(set(urls))
for url in urls :
self.log('URL extracted : %s' % url, level=log.INFO)
item = DmozItem()
if response.status == 404 or response.status == 500:
self.failed_urls.append(response.url)
self.log('failed_url : %s' % self.failed_urls, level=log.INFO)
item['failed_urls'] = self.failed_urls
else :
if url.startswith('http') :
if url.startswith('someurl.com'):
item['internal_link'] = url
# Need to capture screenshot of the extracted url here
self.log('internal_link :%s' % url, level=log.INFO)
else :
item['external_link'] = url
# Need to capture screenshot of the extracted url here
self.log('external_link :%s' % url, level=log.INFO)
self.items.append(item)
self.items = list(set(self.items))
return self.items
else :
self.log('did not recieved expected response', level=log.INFO)
Update : I am using a Virtual Machine (logged-in through putty)
You can look at a rendering server like splash

Scrapy : crawling start_urls causing issues

In my start_urls if I define the home page then scrapy doesn't crawl the page and the "if" check in parse_item function is never hit (eg : 'someurl.com/medical/patient-info'). But when I provide the same page url in start url (i.e start_urls = 'someurl.com/medical/patient-info) it crawls it and hits the below check in parse_item
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
from scrapy import log
class MySpider(CrawlSpider):
items = []
failed_urls = []
duplicate_responses = []
name = 'myspiders'
allowed_domains = ['someurl.com']
login_page = 'someurl.com/login_form'
start_urls = 'someurl.com/' # Facing problem for the url here
rules = [Rule(SgmlLinkExtractor(deny=('logged_out', 'logout',)), follow=True, callback='parse_item')]
def start_requests(self):
yield Request(
url=self.login_page,
callback=self.login,
dont_filter=False
)
def login(self, response):
"""Generate a login request."""
return FormRequest.from_response(response,
formnumber=1,
formdata={'username': 'username', 'password': 'password' },
callback=self.check_login_response)
def check_login_response(self, response):
"""Check the response returned by a login request to see if we are
successfully logged in.
"""
if "Logout" in response.body:
self.log("Successfully logged in. Let's start crawling! :%s" % response, level=log.INFO)
self.log("Response Url : %s" % response.url, level=log.INFO)
return Request(url=self.start_urls)
else:
self.log("Bad times :(", loglevel=log.INFO)
def parse_item(self, response):
# Scrape data from page
hxs = HtmlXPathSelector(response)
self.log('response came in from : %s' % (response), level=log.INFO)
# check for some important page to crawl
if response.url == 'someurl.com/medical/patient-info' :
self.log('yes I am here', level=log.INFO)
urls = hxs.select('//a/#href').extract()
urls = list(set(urls))
for url in urls :
self.log('URL extracted : %s' % url, level=log.INFO)
item = DmozItem()
if response.status == 404 or response.status == 500:
self.failed_urls.append(response.url)
self.log('failed_url : %s' % self.failed_urls, level=log.INFO)
item['failed_urls'] = self.failed_urls
else :
if url.startswith('http') :
if url.startswith('someurl.com'):
item['internal_link'] = url
self.log('internal_link :%s' % url, level=log.INFO)
else :
item['external_link'] = url
self.log('external_link :%s' % url, level=log.INFO)
self.items.append(item)
self.items = list(set(self.items))
return self.items
else :
self.log('did not recieved expected response', level=log.INFO)
I guess start_urls has to be a list.
Try the following: start_urls = ['http://www.someurl.com/', ]

Categories

Resources