Python Beautiful Soup error: list index out of range

I am getting the following error with this script. What I want to do is: if I get a correct URL, check with BeautifulSoup whether there is a form with a submit button whose value is "Send".
Traceback (most recent call last):
  File "tester.py", line 27, in <module>
    if viewstate[0]['value'] == "Send":
IndexError: list index out of range
#!/usr/bin/env python
import urllib2
import sys
import os

url = sys.argv[1]

open_dir_list = open("dirlist.txt", 'r')
dirs = open_dir_list.read().split("\n")
open_dir_list.close()

for dir in dirs:
    uri = url + "/" + dir
    try:
        from BeautifulSoup import BeautifulStoneSoup
        response = urllib2.urlopen(uri)
        if response.getcode() == 200:
            s = uri
            soup = BeautifulStoneSoup(uri)
            viewstate = soup.findAll("input", {"type": "submit"})
            if viewstate[0]['value'] == "Send":
                print("Yay!!!")
            else:
                print("There's nothing here")
    except urllib2.HTTPError, e:
        if e.code == 401:
            print "[!] Authorization Required %s " % (uri)
        elif e.code == 403:
            print "[!] Forbidden %s " % (uri)
        elif e.code == 404:
            print "[-] Not Found %s " % (uri)
        elif e.code == 503:
            print "[!] Service Unavailable %s " % (uri)
        else:
            print "[?] Unknown"
print "\n:. FINISH :.\n"
The following script works fine, but it only checks a single given path:
import urllib
f = urllib.urlopen("http://xxx.xxx.xxx.xxx/button.jsp")
s = f.read()
f.close()

from BeautifulSoup import BeautifulStoneSoup
soup = BeautifulStoneSoup(s)
viewstate = soup.findAll("input", {"type": "submit"})
if viewstate[0]['value'] == "Send":
    print(" Yay!!!")
else:
    print("No Submit Button")

Apart from what I mentioned in a comment, you are not passing the returned HTML to BeautifulSoup: you are passing uri = url + "/" + dir instead of response.read(), so you are searching for the tag in the URI string itself, which almost certainly does not contain any tags. You need to pass the response body, as below:
response = urllib2.urlopen(uri)
if response.getcode() == 200:
    soup = BeautifulStoneSoup(response.read())
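To see why the original raised IndexError: parsing the URI string itself yields a document with no input tags, so findAll returns an empty list and viewstate[0] fails. A quick demonstration (a sketch, using the same BeautifulSoup 3 API):

from BeautifulSoup import BeautifulStoneSoup

# The "document" here is just the URI text: it contains no <input> tags,
# so findAll returns [] and indexing [0] raises IndexError.
soup = BeautifulStoneSoup("http://example.com/button.jsp")
print soup.findAll("input", {"type": "submit"})  # -> []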
If you want the first match, use .find and check if viewstate to make sure it matched something. You can also iterate over the file object, getting a line at a time:
from BeautifulSoup import BeautifulStoneSoup
import urllib2
import sys

url = sys.argv[1]

with open("dirlist.txt", 'r') as f:
    for dir in f:
        uri = url + "/" + dir.rstrip()
        try:
            response = urllib2.urlopen(uri)
            if response.getcode() == 200:
                soup = BeautifulStoneSoup(response.read())
                viewstate = soup.find("input", {"type": "submit"})
                if viewstate and viewstate.get("value") == "Send":
                    print("Shell is found!! Yay!!!")
                else:
                    print("There's nothing here")
        except urllib2.HTTPError, e:
            if e.code == 401:
                print "[!] Authorization Required %s " % (uri)
            elif e.code == 403:
                print "[!] Forbidden %s " % (uri)
            elif e.code == 404:
                print "[-] Not Found %s " % (uri)
            elif e.code == 503:
                print "[!] Service Unavailable %s " % (uri)
            else:
                print "[?] Unknown"
print "\n:. FINISH :.\n"

Related

Extract the Background Image Url of a website from html file set in the style in Python

I'm coding a website cloner in Python. It is doing fine for most files, but I have hit a challenge in getting the URL of background images, e.g.
<div style="background-image: url(images/banner.jpg)" >
The script detects background-image as a folder and assumes the URL is 'background-image: url(images/banner.jpg'. How do I get it to extract the actual URL?
Python 2.7
import urllib2
import sys
import socket
import os
import re

socket.setdefaulttimeout(15)

dataTypesToDownload = [".jpg", ".jpeg", ".png", ".gif", ".ico", ".css", ".js", ".html"]
url = 'http://example.com/'
pathbase = 'theme'

if "http://" not in url and "https://" not in url:
    url = "http://" + url

try:
    os.mkdir(pathbase)
except OSError:
    pass

file = open(pathbase + "/index.html", "w")

try:
    content = urllib2.urlopen(url).read()
except urllib2.URLError as e:
    print "An error occurred: " + str(e.reason)
    exit()

resources = re.split("=\"|='", content)
first = False
for resource in resources:
    if first == False:
        first = True
        continue
    resource = re.split("\"|'", resource)[0]
    if any(s in resource for s in dataTypesToDownload):
        print "Downloading " + resource
        try:
            path = resource.split("/")
            if len(path) != 1:
                path.pop(len(path) - 1)
                trail = "./" + pathbase + "/"
                for folder in path:
                    trail += folder + "/"
                    try:
                        os.mkdir(trail)
                    except OSError:
                        pass
        except IOError:
            pass
        try:
            if "?" in resource:
                download = open(pathbase + "/" + resource.split("?")[len(resource.split("?")) - 2], "w")
            else:
                download = open(pathbase + "/" + resource, "w")
            print url + "/" + resource
            dContent = urllib2.urlopen(url + "/" + resource).read()
        except urllib2.URLError as e:
            print "An error occurred: " + str(e.reason)
            download.close()
            continue
        except IOError:
            pass
            continue
        download.write(dContent)
        download.close()
        print "Downloaded!"

file.write(content)
file.close()
When it encounters style="background-image: url(images/banner.jpg)", I expect it to set resource to images/banner.jpg. But it is setting resource to background-image: url(images/banner.jpg.
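Splitting on =" or =' grabs the entire style attribute value, declaration and all. One way to pull just the path out of a url(...) declaration is a targeted regex; a minimal sketch (not from the original thread, with a hypothetical html value):

import re

html = '<div style="background-image: url(images/banner.jpg)" >'

# Capture whatever sits inside url( ... ), allowing optional quotes.
for match in re.findall(r"url\(\s*['\"]?([^'\")]+)['\"]?\s*\)", html):
    print match  # -> images/banner.jpg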

Python script to harvest tweets to a MongoDb works with users but not hashtags. Any ideas why not?

I'm playing around with the Twitter API and am in the process of developing a script to pull all tweets with a certain hashtag down to a local MongoDB. It works fine when I'm downloading tweets from users, but when downloading tweets for a hashtag I get:
return loads(fp.read(),
AttributeError: 'int' object has no attribute 'read'
Can anyone offer their infinite wisdom on how I could get this script to work?
To run, save it as a .py file, cd to the folder and run:
python twitter.py
Code:
__author__ = 'Tom Cusack'

import pymongo
import oauth2 as oauth
import urllib2, json
import sys, argparse, time

def oauth_header(url, consumer, token):
    params = {'oauth_version': '1.0',
              'oauth_nonce': oauth.generate_nonce(),
              'oauth_timestamp': int(time.time()),
              }
    req = oauth.Request(method='GET', url=url, parameters=params)
    req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, token)
    return req.to_header()['Authorization'].encode('utf-8')

def main():
    ### Twitter Settings
    numtweets = '32000'
    verbose = 'store_true'
    retweet = 'store_false'
    CONSUMER_KEY = 'M7Xu9Wte0eIZvqhb4G9HnIn3G'
    CONSUMER_SECRET = 'c8hB4Qwps2aODQUx7UsyzQuCRifEp3PKu6hPQll8wnJGIhbKgZ'
    ACCESS_TOKEN = '3213221313-APuXuNjVMbRbZpu6sVbETbgqkponGsZJVT53QmG'
    ACCESS_SECRET = 'BJHrqWC9ed3pA5oDstSMCYcUcz2pYF3DmJ7jcuDe7yxvi'
    base_url = url = 'https://api.twitter.com/1.1/search/tweets.json?include_entities=true&count=200&q=#mongodb&include_rts=%s' % (retweet)
    oauth_consumer = oauth.Consumer(key=CONSUMER_KEY, secret=CONSUMER_SECRET)
    oauth_token = oauth.Token(key=ACCESS_TOKEN, secret=ACCESS_SECRET)

    ### Mongodb Settings
    uri = 'mongodb://127.0.0.1:27017/SARKY'
    if uri != None:
        try:
            conn = pymongo.MongoClient(uri)
            print 'Pulling Tweets..'
        except:
            print 'Error: Unable to connect to DB. Check uri variable.'
            return
        uri_parts = pymongo.uri_parser.parse_uri(uri)
        db = conn[uri_parts['database']]
        db['twitter-harvest'].ensure_index('id_str')

    ### Helper Variables for Harvest
    max_id = -1
    tweet_count = 0
    stream = 0

    ### Begin Harvesting
    while True:
        auth = oauth_header(url, oauth_consumer, oauth_token)
        headers = {"Authorization": auth}
        request = urllib2.Request(url, headers=headers)
        try:
            stream = urllib2.urlopen(request)
        except urllib2.HTTPError, err:
            if err.code == 404:
                print 'Error: Unknown user. Check --user arg'
                return
            if err.code == 401:
                print 'Error: Unauthorized. Check Twitter credentials'
                return
        tweet_list = json.load(stream)
        if len(tweet_list) == 0:
            print 'No tweets to harvest!'
            return
        if 'errors' in tweet_list:
            print 'Hit rate limit, code: %s, message: %s' % (tweets['errors']['code'], tweets['errors']['message'])
            return
        if max_id == -1:
            tweets = tweet_list
        else:
            tweets = tweet_list[1:]
            if len(tweets) == 0:
                print 'Finished Harvest!'
                return
        for tweet in tweets:
            max_id = id_str = tweet['id_str']
            try:
                if tweet_count == numtweets:
                    print 'Finished Harvest- hit numtweets!'
                    return
                if uri != None:
                    db[user].update({'id_str': id_str}, tweet, upsert=True)
                else:
                    print tweet['text']
                tweet_count += 1
                if verbose == True and uri != None:
                    print tweet['text']
            except Exception, err:
                print 'Unexpected error encountered: %s' % (err)
                return
        url = base_url + '&max_id=' + max_id

if __name__ == '__main__':
    try:
        main()
    except SystemExit as e:
        if e.code == 0:
            pass
You initially set stream = 0. When your try...except block catches an HTTP error whose code isn't 404 or 401, it doesn't return or break out of the function, so stream is still the integer 0 when json.load(stream) runs, which is exactly the AttributeError: 'int' object has no attribute 'read' you are seeing.
I'd look more closely at what that error response says.
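A minimal sketch of that diagnosis applied to the harvesting loop (same urllib2 setup as the question; the final branch is an addition, not part of the original answer):

try:
    stream = urllib2.urlopen(request)
except urllib2.HTTPError, err:
    if err.code == 404:
        print 'Error: Unknown user. Check --user arg'
        return
    if err.code == 401:
        print 'Error: Unauthorized. Check Twitter credentials'
        return
    # Any other status: print what the API actually returned, then bail
    # out instead of falling through with stream still set to 0.
    print 'Error: HTTP %d: %s' % (err.code, err.read())
    return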

python calling apis events and printing randomly

I have an interval of 60 and want to print 6 events every minute. But it prints 11, 12, or 13 events at random every minute. Why is that? Is it because of my code, or what other factors could cause this?
My code is:
import logging
import httplib
import simplejson as json
import socket
import time
import datetime
import urllib2
import sys
import xml.dom.minidom
from bs4 import BeautifulSoup as soup

SCHEME = """<scheme>
    <title>testingCurrentWeatherSG</title>
    <description>Get data from forecast.</description>
    <use_external_validation>true</use_external_validation>
    <streaming_mode>simple</streaming_mode>
    <endpoint>
        <args>
            <arg name="intervalone">
                <title>Intervalone</title>
                <description>How long to refresh this query?</description>
            </arg>
        </args>
    </endpoint>
</scheme>
"""

def do_scheme():
    print SCHEME

## Utility functions
def fahrenheit(fahren):
    return (fahren - 32) * 5.0 / 9.0

def get_percent(num):
    return num * 100.

## Responses
def get_response(conn, url):
    try:
        conn.request('GET', url)
        result = conn.getresponse()
        data = result.read()
        return json.loads(data)
    except socket.timeout:
        return None

## Printing
def print_forecast(name, di):
    # Print the forecast from 'di', for location 'name'
    # name is the name of the location, di is the api response
    psi_avg = 20
    current = di['currently']
    for key, value in sorted(current.iteritems()):
        if key in ['cloudCover', 'icon', 'ozone', 'precipIntensity',  # time
                   'precipProbability', 'precipType', 'pressure', 'summary',
                   'visibility', 'windBearing', 'windSpeed']:
            print '{0} : {1}'.format(key, value)
        elif key in ['temperature', 'dewPoint']:
            print '%s: %.2f' % (key, fahrenheit(value))
        elif key == 'humidity':
            print '%s: %.2f' % (key, get_percent(value))
    print 'psiAverage : ' + str(psi_avg)
    print 'latitude : ' + str(di['latitude'])
    print 'longitude : ' + str(di['longitude'])
    print 'location : ' + str(name)
    print

def weather_Connection(intervalone):
    host = 'api.forecast.io'
    conn = httplib.HTTPSConnection(host, timeout=60)  # adjust timeout as desired
    try:
        urlnyp = '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.37871,103.848808'
        conn.request('GET', urlnyp)
        resultnyp = conn.getresponse()
        contentnyp = resultnyp.read()
    except socket.timeout:
        print 'socket timeout'
        return
    # the locations and urls for the api calls
    urls = {
        'Choa Chu Kang': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.394557,103.746396',
        'Kallang': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.311469,103.871399',
        'Jurong West': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.352008,103.698599',
        'Redhill': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.289732,103.81675',
        'Tampines': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.353092,103.945229',
        'Yishun': '/forecast/59ff8cb7661d231f2967c2663c0a3bdc/1.429463,103.84022',
    }
    responses = {}
    for i, (name, url) in enumerate(sorted(urls.iteritems())):
        response = get_response(conn, url)
        if not response:
            print 'socket timeout on url#%d: %s' % (i, url)
            return
        responses[name] = response
    conn.close()
    # print the forecast
    for name, data in responses.iteritems():
        print_forecast(name, data)

def get_config():
    # Read XML configuration data passed from splunkd on stdin
    config = {}
    try:
        # read everything from stdin
        config_str = sys.stdin.read()
        # parse the config XML
        doc = xml.dom.minidom.parseString(config_str)
        root = doc.documentElement
        conf_node = root.getElementsByTagName("configuration")[0]
        if conf_node:
            logging.debug("XML: found configuration")
            stanza = conf_node.getElementsByTagName("stanza")[0]
            if stanza:
                stanza_name = stanza.getAttribute("name")
                if stanza_name:
                    logging.debug("XML: found stanza " + stanza_name)
                    config["name"] = stanza_name
                    params = stanza.getElementsByTagName("param")
                    for param in params:
                        param_name = param.getAttribute("name")
                        logging.debug("XML: found param '%s'" % param_name)
                        if param_name and param.firstChild and \
                           param.firstChild.nodeType == param.firstChild.TEXT_NODE:
                            data = param.firstChild.data
                            config[param_name] = data
                            logging.debug("XML: '%s' -> '%s'" % (param_name, data))
        if not config:
            raise Exception, "Invalid configuration received from Splunk."
    except Exception, e:
        raise Exception, "Error getting Splunk configuration via STDIN: %s" % str(e)
    return config

def run():
    # The main function that starts the action. The thread will sleep for
    # however many seconds are configured via the Input.
    # config = get_config()
    # intervalone = config["intervalone"]
    intervalone = 60
    while True:
        weather_Connection(intervalone)
        logging.info("Sleeping for %s seconds" % (intervalone))
        time.sleep(float(intervalone))

if __name__ == '__main__':
    if len(sys.argv) > 1:
        if sys.argv[1] == "--scheme":
            do_scheme()
    else:
        run()
    sys.exit(0)
I've checked and tried your code and it works fine. Note that logging.info produces no output under the default log level (WARNING), so try replacing
logging.info("Sleeping for %s seconds" % (intervalone))
with
print("Sleeping for %s seconds" % (intervalone))
You should see this statement every 6 forecasts.
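Alternatively, keep the logging call and lower the threshold; a minimal sketch (standard library behavior, not part of the original answer):

import logging

# Allow INFO-level records through the root logger so the existing
# logging.info(...) call in run() actually prints.
logging.basicConfig(level=logging.INFO)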
Note: why return from weather_Connection() here?
for i, (name, url) in enumerate(sorted(urls.iteritems())):
    response = get_response(conn, url)
    if not response:
        print 'socket timeout on url#%d: %s' % (i, url)
        return
    responses[name] = response
You can just skip that location with continue:
for i, (name, url) in enumerate(sorted(urls.iteritems())):
    response = get_response(conn, url)
    if not response:
        print 'socket timeout on url#%d: %s' % (i, url)
        continue
    responses[name] = response

python 3 - urllib issue

I'm using Python 3.3.0 on Windows 7.
I have two files: dorks.txt and fuzz.py
dorks.txt contains the following:
/about.php?id=1
/en/company/news/full.php?Id=232
/music.php?title=11
fuzz.py contains the following:
srcurl = "ANY-WEBSITE"
drkfuz = open("dorks.txt", "r").readlines()
print("\n[+] Number of dork names to be fuzzed:",len(drkfuz))
for dorks in drkfuz:
dorks = dorks.rstrip("\n")
srcurl = "http://"+srcurl+dorks
requrl = urllib.request.Request(srcurl)
#httpreq = urllib.request.urlopen(requrl)
# Starting the request
try:
httpreq = urllib.request.urlopen(requrl)
except urllib.error.HTTPError as e:
print ("[!] Error code: ", e.code)
print("")
#sys.exit(1)
except urllib.error.URLError as e:
print ("[!] Reason: ", e.reason)
print("")
#sys.exit(1)
#if e.code != 404:
if httpreq.getcode() == 200:
print("\n*****srcurl********\n",srcurl)
return srcurl
So, when I enter a website name that actually has /about.php?id=1, it works fine. But when I provide a website that has /en/company/news/full.php?Id=232, it first prints Error code: 404 and then gives me the following error: UnboundLocalError: local variable 'e' referenced before assignment, or UnboundLocalError: local variable 'httpreq' referenced before assignment.
I can understand that if the website doesn't have a page matching /about.php?id=1 it gives Error code: 404, but why is it not going back into the for loop to check the remaining dorks in the text file? Why does it stop there and throw an error?
I want to make a script that finds valid pages given just a website address like www.xyz.com.
When the urllib.request.urlopen(requrl) call throws an exception, the variable httpreq is never set. You could set it to None before the try statement, then test whether it is still None afterwards:
httpreq = None
try:
    httpreq = urllib.request.urlopen(requrl)
# ...
if httpreq is not None and httpreq.getcode() == 200:
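Putting that together with the loop, a minimal sketch (hypothetical base URL and dork list, not the original script) that keeps checking the remaining entries after an error:

import urllib.request
import urllib.error

for dork in ["/about.php?id=1", "/en/company/news/full.php?Id=232"]:
    url = "http://example.com" + dork
    httpreq = None
    try:
        httpreq = urllib.request.urlopen(url)
    except urllib.error.HTTPError as err:
        print("[!] Error code:", err.code)
        continue  # move on to the next dork instead of falling through
    except urllib.error.URLError as err:
        print("[!] Reason:", err.reason)
        continue
    if httpreq is not None and httpreq.getcode() == 200:
        print("Found:", url)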
srcurl = "ANY-WEBSITE"
drkfuz = open("dorks.txt", "r").readlines()
print("\n[+] Number of dork names to be fuzzed:",len(drkfuz))
for dorks in drkfuz:
dorks = dorks.rstrip("\n")
srcurl = "http://"+srcurl+dorks
try:
requrl = urllib.request.Request(srcurl)
if requrl != None and len(requrl) > 0:
try:
httpreq = urllib.request.urlopen(requrl)
if httpreq.getcode() == 200:
print("\n*****srcurl********\n",srcurl)
return srcurl
except:
# Handle exception
pass
except:
# Handle your exception
print "Exception"
Untested code, but it will work logically.

Python, checking if a proxy is alive?

The code:
for item in pxfile.readlines():
    if is_OK(item):
        sys.stdout.write(item + "is not OK.")
        item = make(item)
        item = "#" + item
        resfile.write(item)
    else:
        sys.stdout.write(item)
        sys.stdout.write("is OK.")
        line = make(item)
        resfile.write(item)
(If is_OK returns true, it actually means that the proxy doesn't work; I should fix that.)
def is_OK(ip):
    try:
        proxy_handler = urllib2.ProxyHandler({'http': ip})
        opener = urllib2.build_opener(proxy_handler)
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        urllib2.install_opener(opener)
        req = urllib2.Request('http://www.icanhazip.com')
        sock = urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        #print 'Error code: ', e.code
        return e.code
    except Exception, detail:
        #print "ERROR:", detail
        return 1
    return 0
It takes 10 minutes to get a list like this:
141.219.252.132:68664
is OK.118.174.0.155:8080
is OK.91.194.246.169:8080
is not OK.91.194.246.81:8080
is OK.201.245.110.138:8888
is OK.202.43.178.31:3128
is OK.202.109.80.106:8080
Is there a way to make it faster? The output is also formatted badly; I tried removing the newline with strip() but had no luck.
Any ideas?
You should use threads to make the code run quicker:
import urllib2, threading

def is_OK(ip):
    print 'Trying %s ...' % ip
    try:
        proxy_handler = urllib2.ProxyHandler({'http': ip})
        opener = urllib2.build_opener(proxy_handler)
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        urllib2.install_opener(opener)
        req = urllib2.Request('http://www.icanhazip.com')
        urllib2.urlopen(req)
        print '%s is OK' % ip
    except urllib2.HTTPError:
        print '%s is not OK' % ip
    except Exception:
        print '%s is not OK' % ip

a = threading.Thread(None, is_OK, None, ("hostname1",), None)
a.start()
b = threading.Thread(None, is_OK, None, ("hostname2",), None)
b.start()
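To check a whole proxy list this way, a minimal sketch (proxies.txt is a hypothetical file name; is_OK is the function above) that starts one thread per entry and waits for all of them:

import threading

threads = []
with open("proxies.txt") as pxfile:
    for line in pxfile:
        t = threading.Thread(target=is_OK, args=(line.strip(),))
        t.start()
        threads.append(t)
for t in threads:
    t.join()  # wait until every check has finished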
First idea: set a shorter timeout than the default one:
timeout = 10
sock = urllib2.urlopen(req, None, timeout)
You may also use threading so you can test several connections simultaneously.
As for formatting, using strip() this way should be OK:
for line in pxfile:
    item = line.strip()
    if is_OK(item):
        sys.stdout.write(item + " is not OK.\n")
        resfile.write("# " + make(item) + "\n")
    else:
        sys.stdout.write(item + " is OK.\n")
        resfile.write(make(item) + "\n")
