If I define the home page in start_urls, Scrapy doesn't crawl the site and the "if" check in parse_item is never hit (e.g. 'someurl.com/medical/patient-info'). But when I put that same page URL in start_urls (i.e. start_urls = 'someurl.com/medical/patient-info'), it does get crawled and the check below in parse_item is hit.
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
from scrapy import log
class MySpider(CrawlSpider):
    items = []
    failed_urls = []
    duplicate_responses = []

    name = 'myspiders'
    allowed_domains = ['someurl.com']
    login_page = 'someurl.com/login_form'
    start_urls = 'someurl.com/'  # Facing problem for the url here

    rules = [Rule(SgmlLinkExtractor(deny=('logged_out', 'logout',)), follow=True, callback='parse_item')]

    def start_requests(self):
        yield Request(
            url=self.login_page,
            callback=self.login,
            dont_filter=False
        )

    def login(self, response):
        """Generate a login request."""
        return FormRequest.from_response(response,
                                         formnumber=1,
                                         formdata={'username': 'username', 'password': 'password'},
                                         callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        if "Logout" in response.body:
            self.log("Successfully logged in. Let's start crawling! :%s" % response, level=log.INFO)
            self.log("Response Url : %s" % response.url, level=log.INFO)
            return Request(url=self.start_urls)
        else:
            self.log("Bad times :(", loglevel=log.INFO)

    def parse_item(self, response):
        # Scrape data from page
        hxs = HtmlXPathSelector(response)
        self.log('response came in from : %s' % (response), level=log.INFO)

        # check for some important page to crawl
        if response.url == 'someurl.com/medical/patient-info':
            self.log('yes I am here', level=log.INFO)
            urls = hxs.select('//a/@href').extract()
            urls = list(set(urls))
            for url in urls:
                self.log('URL extracted : %s' % url, level=log.INFO)
                item = DmozItem()
                if response.status == 404 or response.status == 500:
                    self.failed_urls.append(response.url)
                    self.log('failed_url : %s' % self.failed_urls, level=log.INFO)
                    item['failed_urls'] = self.failed_urls
                else:
                    if url.startswith('http'):
                        if url.startswith('someurl.com'):
                            item['internal_link'] = url
                            self.log('internal_link :%s' % url, level=log.INFO)
                        else:
                            item['external_link'] = url
                            self.log('external_link :%s' % url, level=log.INFO)
                self.items.append(item)
            self.items = list(set(self.items))
            return self.items
        else:
            self.log('did not recieved expected response', level=log.INFO)
I guess start_urls has to be a list.
Try the following: start_urls = ['http://www.someurl.com/', ]
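For example, a minimal sketch of the affected parts of the spider from the question with start_urls as a list (the domain, credentials and the rest of the spider are placeholders taken from the question):

    start_urls = ['http://www.someurl.com/']

    def check_login_response(self, response):
        """After a successful login, request each start URL explicitly."""
        if "Logout" in response.body:
            self.log("Successfully logged in: %s" % response, level=log.INFO)
            # start_urls is a list now, so build one Request per URL
            for url in self.start_urls:
                yield Request(url=url)
        else:
            self.log("Bad times :(", level=log.INFO)

Note that Request(url=...) expects a single URL string, which is why the original return Request(url=self.start_urls) only appeared to work when start_urls was itself a string.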
On some webpages I already managed to log in successfully and start scraping data, but I noticed that I'm running into big problems on other pages.
I checked the form data in Google Chrome and there is only a username and a password to fill in, so that's what I send in my code, but I can't log in...
My Code
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import DuifloginItem
from scrapy.http import Request, FormRequest
import csv
class DuifLogin(CrawlSpider):
    name = "duiflogin"
    allowed_domains = ['duif.nl']
    login_page = 'https://www.duif.nl/login'
    custom_settings = {'FEED_EXPORT_FIELDS': ['SKU', 'Title', 'Price', 'Link', 'Title_small', 'NL_PL_PC', 'Description']}

    with open("duifonlylinks.csv", "r") as f:
        reader = csv.DictReader(f)
        start_urls = [items['Link'] for items in reader]

    rules = (
        Rule(
            LinkExtractor(),
            callback='parse_page',
            follow=True
        ),
    )

    def start_requests(self):
        yield Request(
            url=self.login_page,
            callback=self.parse,
            dont_filter=True
        )

    def parse(self, response):
        return FormRequest.from_response(response, formdata={
            'username': 'not real',
            'password': 'login data',
        }, callback=self.after_loging)

    def after_loging(self, response):
        accview = response.xpath('//div[@class="c-accountbox clearfix js-match-height"]/h3')
        if accview:
            print('success')
        else:
            print(':(')
        for url in self.start_urls:
            yield response.follow(url=url, callback=self.parse_page)

    def parse_page(self, response):
        productpage = response.xpath('//div[@class="product-details col-md-12"]')
        if not productpage:
            print('No product', response.url)
        for a in productpage:
            items = DuifloginItem()
            items['Link'] = response.url
            items['SKU'] = response.xpath('//p[@class="desc"]/text()').get().strip()
            items['Price'] = response.xpath('//span[@class="regular"]/text()').get()
            items['Title'] = response.xpath('//h1[@class="product-title"]/text()').get()
            items['Title_small'] = response.xpath('//div[@class="left"]/p/text()').get()
            items['NL_PL_PC'] = response.xpath('//div[@class="desc"]/ul/li/em/text()').getall()
            items['Description'] = response.xpath('//div[@class="item"]/p/text()').getall()
            yield items
Here you can see the form data of the 302 POST:
Am I missing cookies? If so, I can't find any on the whole domain...
I can get the loop to work fine, except that I'm getting 3-4 loops each time... I tried removing the start_urls reference, but then the scraping stops working.
import scrapy
from scrapy.http import Request, FormRequest
from scrapy.utils.response import open_in_browser
class PrinterSpider(scrapy.Spider):
    name = 'printers'
    start_urls = ['http://192.168.137.9', 'http://192.168.137.35', 'http://192.168.137.34', 'http://192.168.137.27', 'http://192.168.137.21']

    def parse(self, response):
        token = response.xpath('//*[@name="CSRFToken"]/@value').extract_first()
        yield FormRequest.from_response(response, formnumber=1, formdata={
            'CSRFToken': token,
            'B55d': 'password',
            'loginurl': '/general/status.html'
        }, callback=self.postlogin2)

    def postlogin2(self, response):
        for i in self.start_urls:
            yield Request(
                url=i + "/general/information.html?kind=item",
                callback=self.action)

    def action(self, response):
        drum = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[7]/dl[1]/dd[1]/text()').extract()
        print(drum)
        for i in self.start_urls:
            yield Request(
                url=i + "/net/wired/tcpip.html",
                callback=self.action2)

    def action2(self, response):
        tcpip = response.xpath('//html[1]/body[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/form[1]/div[4]/dl[1]/dd[2]/input[1]/@value').extract()
        print(tcpip)
Scrapy already runs parse() once for every element of start_urls, so in the later callbacks you should build the URL from the response instead of looping over start_urls again.
def postlogin2(self, response):
    yield Request(
        response.url + "/general/information.html?kind=item",
        callback=self.action)
or rather
def postlogin2(self, response):
    yield Request(
        response.urljoin("/general/information.html?kind=item"),
        callback=self.action)
or
def postlogin2(self, response):
    yield response.follow("/general/information.html?kind=item", callback=self.action)
Do the same with other loops.
Doc: Response.urljoin(), Response.follow()
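Put together, the callbacks might look roughly like this once the loops are gone (a sketch reusing the paths and callback names from the question; the long XPaths are abbreviated here as placeholders):

    def postlogin2(self, response):
        # the login response already belongs to one printer, so derive the next URL from it
        yield response.follow("/general/information.html?kind=item", callback=self.action)

    def action(self, response):
        drum = response.xpath('//dl[1]/dd[1]/text()').extract()  # abbreviated placeholder XPath
        print(drum)
        # chain to the next page of the same printer instead of looping over start_urls again
        yield response.follow("/net/wired/tcpip.html", callback=self.action2)

    def action2(self, response):
        tcpip = response.xpath('//dd[2]/input[1]/@value').extract()  # abbreviated placeholder XPath
        print(tcpip)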
I'm using selenium-webdriver to render JavaScript for a Scrapy crawler, but it doesn't look like the AngularJS 'ng-href' links are crawled. Does Scrapy crawl 'ng-href' links? If not, how can I get it to crawl them?
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from CAP.items import Website
from scrapy.mail import MailSender
from scrapy.http import Request
from selenium import webdriver
import time
from scrapy.http import TextResponse
class HomeSpider(CrawlSpider):
    name = "capseleniums"
    allowed_domains = ["www.ecommerce.com", "learn.ecommerce.com", "health.ecommerce.com", "wm15.ecommerce.com", "wm13.ecommerce.com", "wm12.ecommerce.com"]
    handle_httpstatus_list = [500, 502, 503, 504, 400, 408, 404]

    def start_requests(self):
        start_urls = reversed([
            'http://wm12.ecommerce.com/health-wellness-center/',
            'http://wm13.ecommerce.com/Cook/',
            'http://wm15.ecommerce.com/electronics-resource-center/',
            'http://health.ecommerce.com/vitamins-wellness-center/',
            'http://learn.ecommerce.com/Tips-Ideas/',
        ])
        return [Request(url=start_url) for start_url in start_urls]

    def trim(link_text):
        return link_text.strip(' \t\n\r')

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                deny=(),
                process_value=trim,
            ),
            callback="parse_items",
            follow=False,
        ),
    )

    def __init__(self, category=None, *args, **kwargs):
        self.driver = webdriver.PhantomJS(service_args=['--load-images=no'])
        super(HomeSpider, self).__init__(*args, **kwargs)

    def __del__(self):
        self.driver.stop()

    def parse_items(self, response):
        hxs = self.driver
        hxs.get(response.url)
        time.sleep(1)
        body = hxs.page_source
        sel_response = TextResponse(url=response.url, body=body, encoding='utf-8')
        hxs = Selector(sel_response)
        sites = hxs.xpath('//html')
        items = []

        if response.status == 404:
            for site in sites:
                item = Website()
                item['url'] = response.meta.get('redirect_urls', [response.url])[0]
                item['referer'] = response.request.headers.get('Referer')
                item['status'] = response.status
                items.append(item)
            return items

        if hxs.xpath('/html/head/title/text()[contains(.,"invalid")]'):
            for site in sites:
                item = Website()
                item['url'] = response.meta.get('redirect_urls', [response.url])[0]
                item['referer'] = response.request.headers.get('Referer')
                item['status'] = response.status
                items.append(item)
            return items
        elif hxs.xpath('//head/link[@rel="canonical"]/@href[contains(.,"invalid-category-id")]'):
            for site in sites:
                item = Website()
                item['url'] = response.meta.get('redirect_urls', [response.url])[0]
                item['referer'] = response.request.headers.get('Referer')
                item['status'] = response.status
                items.append(item)
            return items
        else:
            if hxs.xpath('//*[@class="result-summary-container"]/text()[contains(.,"Showing 0 of")]'):
                for site in sites:
                    item = Website()
                    item['url'] = response.meta.get('redirect_urls', [response.url])[0]
                    item['referer'] = response.request.headers.get('Referer')
                    item['status'] = response.status
                    items.append(item)
                return items
By default, the link extractor looks for links in the href attribute of a and area tags.
You just need to additionally configure the attrs argument so that it includes the ng-href attribute:
Rule(LinkExtractor(attrs=('href', 'ng-href')), callback="parse_items", follow=False),
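Dropped into the spider above, the rules would then look something like this (a sketch that keeps the trim helper and the parse_items callback from the question):

    rules = (
        Rule(
            LinkExtractor(
                attrs=('href', 'ng-href'),  # pick up AngularJS ng-href links as well as href
                allow=(),
                deny=(),
                process_value=trim,         # trim helper defined in the question's spider
            ),
            callback="parse_items",
            follow=False,
        ),
    )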
I'm using Scrapy 0.24.4 and trying to scrape some information from threatexpert. I've almost got it: I can grab the information on all the pages EXCEPT the first page (the start_url). I've tried parse_start_url and adding Rules and just can't get it to work. I'm sure it's something I've overlooked, but I've been looking at it all weekend and just need a break. I'd appreciate any suggestions. I did get it to work with a range in the start_urls, but it looked kind of inelegant and I'm trying to learn the right way. Many thanks in advance!
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(scrapy.Spider):
    name = 'threatexpert'
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]

    def parse(self, response):
        print '++++++++++++++++++++++++pull all page links+++++++++++++++++++++++'
        urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
        for url in urls:
            url = urlparse.urljoin(response.url, url)
            self.log('Found follow url: %s' % url)
            yield scrapy.Request(url, callback=self.parse_links)

    def parse_links(self, response):
        print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
        urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
        for url in urls:
            url = urlparse.urljoin(response.url, url)
            self.log('Found follow url: %s' % url)
            yield scrapy.Request(url, callback=self.parse_items)

    def parse_items(self, response):
        self.log("Hi, this is an item page! %s" % response.url)
        item = ThreatExpert()
        item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
        yield item
Many, many thanks for the response; it led me to what finally worked! I just had the wrong base class: instead of class ThreatExpertSpider(scrapy.Spider) I now use class ThreatExpertSpider(CrawlSpider). I'm still not entirely sure how it works, but it does. I know, RTFM, lol, but I'm learning. Here is what worked for me, in case anyone else is looking for this.
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse
class ThreatExpertSpider(CrawlSpider):
    name = 'threatexpert'
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=3&sl=1"]

    rules = (
        Rule(SgmlLinkExtractor(allow=r'page=\d'), callback='parse_links', follow=True),
    )

    def parse_start_url(self, response):
        print '++++++++++++++++++++++++parse_start_url+++++++++++++++++++++++'
        return self.parse_items(response)
        # urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
        # for url in urls:
        #     url = urlparse.urljoin(response.url, url)
        #     self.log('Found follow url: %s' % url)
        #     yield scrapy.Request(url, callback=self.parse_links)

    def parse_links(self, response):
        print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
        urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
        for url in urls:
            url = urlparse.urljoin(response.url, url)
            self.log('Found follow url: %s' % url)
            yield scrapy.Request(url, callback=self.parse_items)

    def parse_items(self, response):
        self.log("Hi, this is an item page! %s" % response.url)
        item = ThreatExpert()
        item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
        # item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
        # if item['callback']:
        #     item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
        # else:
        #     del item['callback']
        yield item
Please refer to the code below; this works for me. If you have any queries, please post them in a comment.
from scrapy.spider import BaseSpider
from scrapy.http import Request
import re
from urlparse import urljoin
from scrapy.selector import HtmlXPathSelector
from threatexpert.items import ThreatExpert
import inspect
class usmallspider(BaseSpider):
    name = 'threatexpert'
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
        for url in urls:
            url = urljoin(response.url, url)
            print url
            if url:
                yield Request(url, callback=self.parse_links)

    def parse_links(self, response):
        hxs = HtmlXPathSelector(response)
        urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
        for url in urls:
            url = urljoin(response.url, url)
            if url:
                yield Request(url, callback=self.parse_items)

    def parse_items(self, response):
        itm = []
        item = ThreatExpert()
        hxs = HtmlXPathSelector(response)
        item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
        itm.append(item)
        return itm
I am scraping a website using Scrapy and storing the internal/external links in my items class.
Is there a way to capture a screenshot of each link as it is scraped?
Note: the website has a login authorisation form.
My Code (spider.py)
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import urlparse
from scrapy import log
class MySpider(CrawlSpider):
    items = []
    failed_urls = []
    duplicate_responses = []

    name = 'myspiders'
    allowed_domains = ['someurl.com']
    login_page = 'someurl.com/login_form'
    start_urls = 'someurl.com/'

    rules = [Rule(SgmlLinkExtractor(deny=('logged_out', 'logout',)), follow=True, callback='parse_start_url')]

    def start_requests(self):
        yield Request(
            url=self.login_page,
            callback=self.login,
            dont_filter=False
        )

    def login(self, response):
        """Generate a login request."""
        return FormRequest.from_response(response,
                                         formnumber=1,
                                         formdata={'username': 'username', 'password': 'password'},
                                         callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        if "Logout" in response.body:
            self.log("Successfully logged in. Let's start crawling! :%s" % response, level=log.INFO)
            self.log("Response Url : %s" % response.url, level=log.INFO)
            yield Request(url=self.start_urls)
        else:
            self.log("Bad times :(", loglevel=log.INFO)

    def parse_start_url(self, response):
        # Scrape data from page
        hxs = HtmlXPathSelector(response)
        self.log('response came in from : %s' % (response), level=log.INFO)

        # check for some important page to crawl
        if response.url == 'someurl.com/medical/patient-info':
            self.log('yes I am here', level=log.INFO)
            urls = hxs.select('//a/@href').extract()
            urls = list(set(urls))
            for url in urls:
                self.log('URL extracted : %s' % url, level=log.INFO)
                item = DmozItem()
                if response.status == 404 or response.status == 500:
                    self.failed_urls.append(response.url)
                    self.log('failed_url : %s' % self.failed_urls, level=log.INFO)
                    item['failed_urls'] = self.failed_urls
                else:
                    if url.startswith('http'):
                        if url.startswith('someurl.com'):
                            item['internal_link'] = url
                            # Need to capture screenshot of the extracted url here
                            self.log('internal_link :%s' % url, level=log.INFO)
                        else:
                            item['external_link'] = url
                            # Need to capture screenshot of the extracted url here
                            self.log('external_link :%s' % url, level=log.INFO)
                self.items.append(item)
            self.items = list(set(self.items))
            return self.items
        else:
            self.log('did not recieved expected response', level=log.INFO)
Update: I am using a virtual machine (logged in through PuTTY).
You can look at a rendering server like Splash.
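For instance, with the scrapy-splash plugin and a Splash instance running (say at http://localhost:8050), the render.png endpoint returns a PNG of the rendered page. A minimal sketch of how the extracted URLs could be screenshotted from the spider above; the settings shown, the save_screenshot callback and the filename scheme are assumptions, not part of the question:

    # Assumed setup in settings.py (sketch):
    #   SPLASH_URL = 'http://localhost:8050'
    #   DOWNLOADER_MIDDLEWARES = {
    #       'scrapy_splash.SplashCookiesMiddleware': 723,
    #       'scrapy_splash.SplashMiddleware': 725,
    #   }
    #   DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

    from scrapy_splash import SplashRequest

    def parse_start_url(self, response):
        # ... existing link extraction from the question ...
        for url in urls:
            # at the points marked "Need to capture screenshot of the extracted url here"
            yield SplashRequest(
                url,
                callback=self.save_screenshot,   # hypothetical callback name
                endpoint='render.png',           # Splash endpoint that returns a PNG
                args={'width': 1024, 'wait': 0.5},
            )

    def save_screenshot(self, response):
        # response.body holds the raw PNG bytes of the rendered page
        filename = 'screenshot-%s.png' % response.url.replace('://', '_').replace('/', '_')
        with open(filename, 'wb') as f:
            f.write(response.body)

Since this runs on a headless virtual machine anyway, a rendering service like Splash avoids having to attach a display; the width and wait arguments are optional parameters of the Splash HTTP API.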