Website scraping and screenshots - python

I am scrapping a website using scrapy and storing the internal/external links in my items class.
Is there a way that when the link is scrapped, I can capture the screenshot of it ?
Note : the website has a login authorisation form.
My Code (spider.py)
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
from scrapy import log
class MySpider(CrawlSpider):
items = []
failed_urls = []
duplicate_responses = []
name = 'myspiders'
allowed_domains = ['someurl.com']
login_page = 'someurl.com/login_form'
start_urls = 'someurl.com/'
rules = [Rule(SgmlLinkExtractor(deny=('logged_out', 'logout',)), follow=True, callback='parse_start_url')]
def start_requests(self):
yield Request(
url=self.login_page,
callback=self.login,
dont_filter=False
)
def login(self, response):
"""Generate a login request."""
return FormRequest.from_response(response,
formnumber=1,
formdata={'username': 'username', 'password': 'password' },
callback=self.check_login_response)
def check_login_response(self, response):
"""Check the response returned by a login request to see if we are
successfully logged in.
"""
if "Logout" in response.body:
self.log("Successfully logged in. Let's start crawling! :%s" % response, level=log.INFO)
self.log("Response Url : %s" % response.url, level=log.INFO)
yield Request(url=self.start_urls)
else:
self.log("Bad times :(", loglevel=log.INFO)
def parse_start_url(self, response):
# Scrape data from page
hxs = HtmlXPathSelector(response)
self.log('response came in from : %s' % (response), level=log.INFO)
# check for some important page to crawl
if response.url == 'someurl.com/medical/patient-info' :
self.log('yes I am here', level=log.INFO)
urls = hxs.select('//a/#href').extract()
urls = list(set(urls))
for url in urls :
self.log('URL extracted : %s' % url, level=log.INFO)
item = DmozItem()
if response.status == 404 or response.status == 500:
self.failed_urls.append(response.url)
self.log('failed_url : %s' % self.failed_urls, level=log.INFO)
item['failed_urls'] = self.failed_urls
else :
if url.startswith('http') :
if url.startswith('someurl.com'):
item['internal_link'] = url
# Need to capture screenshot of the extracted url here
self.log('internal_link :%s' % url, level=log.INFO)
else :
item['external_link'] = url
# Need to capture screenshot of the extracted url here
self.log('external_link :%s' % url, level=log.INFO)
self.items.append(item)
self.items = list(set(self.items))
return self.items
else :
self.log('did not recieved expected response', level=log.INFO)
Update : I am using a Virtual Machine (logged-in through putty)

You can look at a rendering server like splash

Related

Scrapy tracking and scraping third page

after trying to add third page to this shenanigas i got an error "You can't mix str and non-str arguments". My goal is to use url from 'website' and scrap data from it. How do i do it?
Here is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider
class RynekMainSpider(scrapy.Spider):
name = "RynekMain"
start_urls = [
'https://rynekpierwotny.pl/deweloperzy/?page=1']
def parse(self, response):
websites = response.css('div#root')[0]
PAGETEST = response.xpath('//a[contains(#class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
for website in websites.css('li.rp-np9kb1'):
page = website.css('a::attr(href)').get()
address = website.css('address.rp-o9b83y::text').get()
name = website.css('h2.rp-69f2r4::text').get()
params = {
'address' : address,
'name' : name,
'href' : page,
}
url = response.urljoin(page)
urlem = response.urljoin(website)
yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)
yield Request(url=response.urljoin(PAGETEST), callback=self.parse)
def parseMain(self, response, params=None):
# print(response.url)
website = response.css('div.rp-l0pkv6 a::attr(href)').get()
params['website'] = website
yield params
def parseEmail(self,response, params=None):
hps = HtmlXPathSelector(response)
email = hxs.xpath('//body').re('([a-zA-Z0-9_.+-]+#[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(RynekMainSpider)
process.start()
Thanks for help in advance.
A simple debugging pointed me to the error line:
urlem = response.urljoin(website) # You can't mix str and non-str arguments
website is a Selector, and urljoin needs a string.
Perhaps what you are looking for is this:
urlem = response.urljoin(website.xpath('.//a/#href').get())
Ok i solved it.
I just moved yield a bit.
Yield can't just take non existent strings, string needs to be created first,
that's why i got problems before.
Website url was scraped in parseMain not in parse.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request, Spider
class RynekMainSpider(scrapy.Spider):
name = "RynekMain"
start_urls = [
'https://rynekpierwotny.pl/deweloperzy/?page=1']
def parse(self, response):
websites = response.css('div#root')[0]
PAGETEST = response.xpath('//a[contains(#class,"rp-173nt6g")]/../following-sibling::li').css('a::attr(href)').get()
for website in websites.css('li.rp-np9kb1'):
page = website.css('a::attr(href)').get()
address = website.css('address.rp-o9b83y::text').get()
name = website.css('h2.rp-69f2r4::text').get()
params = {
'address' : address,
'name' : name,
'href' : page,
}
url = response.urljoin(page)
yield Request(url=url, cb_kwargs={'params': params}, callback=self.parseMain)
yield Request(url=response.urljoin(PAGETEST), callback=self.parse)
def parseMain(self, response, params=None):
# print(response.url)
website = response.css('div.rp-l0pkv6 a::attr(href)').get()
params['website'] = website
urlem = response.urljoin(website)
yield Request(url=urlem, cb_kwargs={'params': params}, callback=self.parseEmail)
def parseEmail(self,response, params=None):
email = response.css('div.m-Footer__company a::attr(href)').get()
params['email'] = email
yield params
if __name__ == "__main__":
process =CrawlerProcess()
process.crawl(RynekMainSpider)
process.start()

Scrapy correct form data but cant login

I already managed on some webpages to successfully log in and start scraping data, but i noticed, that i got some big problems with other pages.
I checked in GoogleChrome the form data and there is only username and password to fill in, so i did it in my code, but i cant login...
My Code
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import DuifloginItem
from scrapy.http import Request, FormRequest
import csv
class DuifLogin(CrawlSpider):
name = "duiflogin"
allowed_domains = ['duif.nl']
login_page = 'https://www.duif.nl/login'
custom_settings = {'FEED_EXPORT_FIELDS' : ['SKU', 'Title', 'Price', 'Link', 'Title_small', 'NL_PL_PC', 'Description' ] }
with open("duifonlylinks.csv","r") as f:
reader = csv.DictReader(f)
start_urls = [items['Link'] for items in reader]
rules = (
Rule(
LinkExtractor(),
callback='parse_page',
follow=True
),
)
def start_requests(self):
yield Request(
url=self.login_page,
callback=self.parse,
dont_filter=True
)
def parse(self, response):
return FormRequest.from_response(response,formdata={
'username' : 'not real',
'password' : 'login data',
}, callback=self.after_loging)
def after_loging(self, response):
accview = response.xpath('//div[#class="c-accountbox clearfix js-match-height"]/h3')
if accview:
print('success')
else:
print(':(')
for url in self.start_urls:
yield response.follow(url=url, callback=self.parse_page)
def parse_page(self, response):
productpage = response.xpath('//div[#class="product-details col-md-12"]')
if not productpage:
print('No product', response.url)
for a in productpage:
items = DuifloginItem()
items['Link'] = response.url
items['SKU'] = response.xpath('//p[#class="desc"]/text()').get().strip()
items['Price'] = response.xpath('//span[#class="regular"]/text()').get()
items['Title'] = response.xpath('//h1[#class="product-title"]/text()').get()
items['Title_small'] = response.xpath('//div[#class="left"]/p/text()').get()
items['NL_PL_PC'] = response.xpath('//div[#class="desc"]/ul/li/em/text()').getall()
items['Description'] = response.xpath('//div[#class="item"]/p/text()').getall()
yield items
Here you can see form data on the 302 POST:
Do i miss cookies? If yes, i cant find any on the whole domain...

Scrapy Multiple loops for Multiple URLs

I can get the loop to work great except I am getting 3-4 loops each time...I tried removing the start_urls reference but then the scraping stops working
import scrapy
from scrapy.http import Request, FormRequest
from scrapy.utils.response import open_in_browser
class PrinterSpider(scrapy.Spider):
name = 'printers'
start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21' ]
def parse(self, response):
token = response.xpath('//*[#name="CSRFToken"]/#value').extract_first()
yield FormRequest.from_response(response, formnumber=1, formdata={
'CSRFToken' : token,
'B55d' : 'password',
'loginurl' : '/general/status.html'
}, callback=self.postlogin2)
def postlogin2(self,response):
for i in self.start_urls:
yield Request(
url = i+"/general/information.html?kind=item",
callback=self.action)
def action(self,response):
drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
print(drum)
for i in self.start_urls:
yield Request(
url = i+"/net/wired/tcpip.html",
callback=self.action2)
def action2(self, response):
tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/#value').extract()
print(tcpip)
Scrapy uses elements from start_urls to run parse() - later you should get url from response without loop.
def postlogin2(self, response):
yield Request(
response.url + "/general/information.html?kind=item",
callback=self.action)
or rather
def postlogin2(self, response):
yield Request(
response.urljoin("/general/information.html?kind=item"),
callback=self.action)
or
def postlogin2(self, response):
yield response.follow("/general/information.html?kind=item", callback=self.action)
Do the same with other loops.
Doc: Response.urljoin(), Response.follow()

scrapy parsing first page

I'm using scrapy .24.4, I'm trying to scrape some information from threatexpert and I've almost got it, I can grab all the information on all the pages EXCEPT the first page(or start_url). I've tried parse_start_url and adding Rules and just can't get it to work. I'm sure it's just something I've overlooked but I've been looking at it all weekend and just need a break. I'd appreciate if anyone has any suggestions etc. Oh I did get it to work with a range in the start_url but it looked kind of inelegant and I'm trying to learn the right way. Many thanks in advance!!
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(scrapy.Spider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]
def parse(self, response):
print '++++++++++++++++++++++++pull all page links+++++++++++++++++++++++'
urls = response.xpath('//a[contains(#href, "page")]/#href').extract()
for url in urls:
url = urlparse.urljoin(response.url, url)
self.log('Found follow url: %s' % url)
yield scrapy.Request(url, callback = self.parse_links)
def parse_links(self, response):
print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
urls = response.xpath('//a[contains(#href, "md5")]/#href').extract()
for url in urls:
url = urlparse.urljoin(response.url, url)
self.log('Found follow url: %s' % url)
yield scrapy.Request(url, callback = self.parse_items)
def parse_items(self, response):
self.log("Hi, this is an item page! %s" % response.url)
item = ThreatExpert()
item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
yield item
Many, many thanks for the response it led me to what I got to work! Just had the wrong class instead of class ThreatExpertSpider(scrapy.Spider), I used class ThreatExpertSpider(CrawlSpider):, I'm still not entirely sure how it works but it does. I know RTFM, lol, but I'm learning. Here is what worked for me in case anyone else is looking for this.
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(CrawlSpider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=3&sl=1"]
rules = (
Rule(SgmlLinkExtractor(allow=r'page=\d'), callback='parse_links', follow=True),
)
def parse_start_url(self, response):
print '++++++++++++++++++++++++parse_start_url+++++++++++++++++++++++'
return self.parse_items(response)
# urls = response.xpath('//a[contains(#href, "page")]/#href').extract()
# for url in urls:
# url = urlparse.urljoin(response.url, url)
# self.log('Found follow url: %s' % url)
# yield scrapy.Request(url, callback = self.parse_links)
def parse_links(self, response):
print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
urls = response.xpath('//a[contains(#href, "md5")]/#href').extract()
for url in urls:
url = urlparse.urljoin(response.url, url)
self.log('Found follow url: %s' % url)
yield scrapy.Request(url, callback = self.parse_items)
def parse_items(self, response):
self.log("Hi, this is an item page! %s" % response.url)
item = ThreatExpert()
item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
# item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
# if item['callback']:
# item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
# else:
# del item['callback']
yield item
Please refer the below code, This is works for me. If you have any queries please update through command.
from scrapy.spider import BaseSpider
from scrapy.http import Request
import re
from urlparse import urljoin
from scrapy.selector import HtmlXPathSelector
from threatexpert.items import ThreatExpert
import inspect
class usmallspider(BaseSpider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
urls = response.xpath('//a[contains(#href, "page")]/#href').extract()
for url in urls:
url = urljoin(response.url, url)
print url
if url:
yield Request(url, callback=self.parse_links)
def parse_links(self, response):
hxs = HtmlXPathSelector(response)
urls = response.xpath('//a[contains(#href, "md5")]/#href').extract()
for url in urls:
url = urljoin(response.url, url)
if url:
yield Request(url, callback = self.parse_items)
def parse_items(self, response):
itm=[]
item = MallUk1Item()
hxs = HtmlXPathSelector(response)
item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
itm.append(item)
return itm

Scrapy : crawling start_urls causing issues

In my start_urls if I define the home page then scrapy doesn't crawl the page and the "if" check in parse_item function is never hit (eg : 'someurl.com/medical/patient-info'). But when I provide the same page url in start url (i.e start_urls = 'someurl.com/medical/patient-info) it crawls it and hits the below check in parse_item
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
from scrapy import log
class MySpider(CrawlSpider):
items = []
failed_urls = []
duplicate_responses = []
name = 'myspiders'
allowed_domains = ['someurl.com']
login_page = 'someurl.com/login_form'
start_urls = 'someurl.com/' # Facing problem for the url here
rules = [Rule(SgmlLinkExtractor(deny=('logged_out', 'logout',)), follow=True, callback='parse_item')]
def start_requests(self):
yield Request(
url=self.login_page,
callback=self.login,
dont_filter=False
)
def login(self, response):
"""Generate a login request."""
return FormRequest.from_response(response,
formnumber=1,
formdata={'username': 'username', 'password': 'password' },
callback=self.check_login_response)
def check_login_response(self, response):
"""Check the response returned by a login request to see if we are
successfully logged in.
"""
if "Logout" in response.body:
self.log("Successfully logged in. Let's start crawling! :%s" % response, level=log.INFO)
self.log("Response Url : %s" % response.url, level=log.INFO)
return Request(url=self.start_urls)
else:
self.log("Bad times :(", loglevel=log.INFO)
def parse_item(self, response):
# Scrape data from page
hxs = HtmlXPathSelector(response)
self.log('response came in from : %s' % (response), level=log.INFO)
# check for some important page to crawl
if response.url == 'someurl.com/medical/patient-info' :
self.log('yes I am here', level=log.INFO)
urls = hxs.select('//a/#href').extract()
urls = list(set(urls))
for url in urls :
self.log('URL extracted : %s' % url, level=log.INFO)
item = DmozItem()
if response.status == 404 or response.status == 500:
self.failed_urls.append(response.url)
self.log('failed_url : %s' % self.failed_urls, level=log.INFO)
item['failed_urls'] = self.failed_urls
else :
if url.startswith('http') :
if url.startswith('someurl.com'):
item['internal_link'] = url
self.log('internal_link :%s' % url, level=log.INFO)
else :
item['external_link'] = url
self.log('external_link :%s' % url, level=log.INFO)
self.items.append(item)
self.items = list(set(self.items))
return self.items
else :
self.log('did not recieved expected response', level=log.INFO)
I guess start_urls has to be a list.
Try the following: start_urls = ['http://www.someurl.com/', ]

Categories

Resources