I am trying to create a threaded subdomain enumerator, and every time it finishes enumerating subdomains on sites like google.com, the program just freezes and never exits, so it never prints the final line print("\n[+] Done enumerating").
import requests
import validators
from concurrent.futures import ThreadPoolExecutor

def enumerate_subdomain(subdomain):
    http_url = f"http://{subdomain}.{domain}"
    https_url = f"https://{subdomain}.{domain}"
    try:
        if validators.url(https_url) == True:
            requests.get(https_url)
            print(f"[!] Subdomain found: {https_url}")
        else:
            requests.get(http_url)
            print(f"[!] Subdomain found: {http_url}")
    except requests.ConnectionError:
        pass

domain = input("[+] Domain (e.g. example.com): ")
file = input("[+] Path to wordlist file: ")
threads = int(input("[+] Amount of threads (10-50 is recommended): "))

with open(file, "r") as f:
    content = f.read()
    subdomains = content.splitlines()

if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(enumerate_subdomain, subdomains)
        executor.shutdown(wait=False)
    print("\n[+] Done enumerating")
I tried adding executor.shutdown(wait=True) after executor.map, but it still doesn't work.
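One likely cause to check (an assumption on my part, since nothing in the code bounds the requests) is that requests.get is called without a timeout, so a single worker stuck on a host that never answers keeps the with ThreadPoolExecutor block from ever finishing. A minimal sketch of the same enumerator with a timeout and explicit result handling (check_subdomain and run are hypothetical helpers, not a drop-in fix):

import requests
import validators
from concurrent.futures import ThreadPoolExecutor, as_completed

def check_subdomain(domain, subdomain):
    # Try HTTPS first, then HTTP; the timeout keeps one dead host from
    # blocking the pool forever.
    for url in (f"https://{subdomain}.{domain}", f"http://{subdomain}.{domain}"):
        if not validators.url(url):
            continue
        try:
            requests.get(url, timeout=5)
            return url
        except requests.exceptions.RequestException:
            pass
    return None

def run(domain, subdomains, threads):
    with ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(check_subdomain, domain, s) for s in subdomains]
        for future in as_completed(futures):
            found = future.result()
            if found:
                print(f"[!] Subdomain found: {found}")
    print("\n[+] Done enumerating")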
Related
I have a list of about 1000 IP addresses. I am reading ip_file.txt and storing the results in a result_date.txt file. Below is the code with which I achieved the result, but my issue is that it takes too long to work through the entire file. Can anyone suggest multithreading, please, so that the desired result can be achieved more quickly? Thanks in advance.
#!/usr/bin/env python
import os
import csv
import paramiko
from datetime import datetime
import time
import sys
import re
from collections import defaultdict
# Verifies your os type
from paramiko import file

OS_TYPE = os.name
# Sets the count modifier to the os type
count = '-n' if OS_TYPE == 'nt' else '-c'

def create_ip_list():
    ip_list = []
    with open("ip_file.txt", "r") as file:
        for line in file:
            ip_list.append(line.strip())
    return ip_list

# fetching data
now = datetime.now()
dat = now.strftime("%d/%m/%Y")
# time = now.strftime("%H:%M:%S")
date_string = dat.replace('/', '-')
timestr = time.strftime("%d%m%Y-%H%M%S")

def ping_device(ip_list):
    """Ping ip_list and return results
    return: None
    rtype: None
    """
    results_file = open("results_" + str(timestr) + ".txt", "w")
    for ip in ip_list:
        response = os.popen(f"ping {ip} {count} 1").read()
        time.sleep(1.5)
        # fetch Average time
        print(response)
        latency = ""  # avoid a NameError if the latency line is never found
        for i in response.split("\n"):
            para = i.split("=")
            try:
                if para[0].strip() == "Minimum":
                    latency = para[3].strip()
                    print(latency)
                    # output1=latency[0:8].split(" ")
                    # test=output1[0]
                    # print(test)
            except:
                print("time run")
        # both substrings must appear in the ping output
        if "Received = 1" in response and "Approximate" in response:
            # print(f"UP {ip} Ping Successful")
            results_file.write(f"{ip},UP,{latency}" + "\n")
        else:
            print(f"Down {ip} Ping Unsuccessful")
            results_file.write(f"{ip} Down" + "\n")
    results_file.close()

if __name__ == "__main__":
    ping_device(create_ip_list())
Write a function ping_one_device that takes a single ip and returns a single string giving the status. It should be easy to pull this out of ping_device.
Then
with open("results_" + timestr + ".txt", "w") as results_file:
    with ThreadPoolExecutor() as executor:
        for result in executor.map(ping_one_device, ip_list):
            results_file.write(result)
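Putting it together, a sketch of how that refactor might look (ping_one_device is the hypothetical helper described above, and the parsing simply mirrors the Windows-style ping handling from the original script):

import os
import time
from concurrent.futures import ThreadPoolExecutor

count = '-n' if os.name == 'nt' else '-c'

def ping_one_device(ip):
    """Ping a single ip and return one result line, e.g. '10.0.0.1,UP,1ms'."""
    response = os.popen(f"ping {ip} {count} 1").read()
    latency = ""
    for line in response.split("\n"):
        para = line.split("=")
        if para[0].strip() == "Minimum" and len(para) > 3:
            latency = para[3].strip()
    if "Received = 1" in response and "Approximate" in response:
        return f"{ip},UP,{latency}\n"
    return f"{ip},Down\n"

if __name__ == "__main__":
    ip_list = [line.strip() for line in open("ip_file.txt")]
    timestr = time.strftime("%d%m%Y-%H%M%S")
    with open("results_" + timestr + ".txt", "w") as results_file:
        with ThreadPoolExecutor(max_workers=20) as executor:
            # executor.map keeps results in input order, so each line
            # in the output file matches the order of ip_file.txt.
            for result in executor.map(ping_one_device, ip_list):
                results_file.write(result)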
I'm trying to open a file as text, choosing the file via str(raw_input(...)), but it doesn't work inside def main():.
My second script, the normal one without threading, uses try:. The threaded one never gets as far as opening the mail list.
Here is a link to the scripts in ghostbin:
Normal Script Without Threading and Multithread Script
The Normal Script Without Threading
try:
    filelist = str(raw_input("[+] Enter Mailist File: "))
    loglive = open('live.txt','a')
    logdie = open('die.txt','a')
    logun = open('uncheck.txt','a')
except Exception as Err:
    print("ERROR : File List Not Found!")
    sys.exit()

list = open(filelist, "r")

while True:
    email = list.readline().replace("\n","")
    if not email:
        break
Multithread Script
def main():
    start_time = time.time()
    tanya = str(raw_input("[+] Enter Mailist File: "))
    mailist = open(tanya, 'r')
    while True:
        email = mailist.readline().replace("\n","")
        if not email:
            break
        s = requests.session()
You need to call the main function for it to be executed. At the end of your file, add:
if __name__ == '__main__':
    main()
The reason for this can be found here: What does if __name__ == "__main__": do?
I wrote this crawler in Python; it dumps several parameters to a JSON output file based on an input list of domains.
I have this question:
Do I need to close the HTTP connection in each thread? The input data is roughly 5 million items. It processes about 50 iterations per second at the beginning, but after some time it drops to 1-2 per second and/or hangs (no kernel messages and no errors on stdout). Is this caused by the code or is it network related? I suspect the software, because when I restart it, it starts again at the high rate (about 50 iterations per second).
Any tips on how to improve the code below are also welcome, especially regarding speed and crawling throughput.
Code in question:
import re
import sys
import urllib2
import pprint
from threading import Thread
# (re, sys and Thread are used below but were missing from the original imports)
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld

resfile = open("out.txt",'a')

concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://"+ourl)
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www"+ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\""+item+"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":\"http://"+ourl.rstrip()+"\",\"IP:\""+ip+"\"," + "\"Server\":"+ "\""+str(header.getheader("Server")).replace("None","")+"\",\"PoweredBy\":" + "\""+str(header.getheader("X-Powered-By")).replace("None","")+"\""+",\"MetaGenerator\":\"\",\"Email\":\""+email+"\",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        #print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt",'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
Update 1:
Closing the socket and file descriptor makes it work better; it no longer seems to hang after some time. Performance is about 50 requests/sec on a home laptop and about 100 requests/sec on a VPS.
import re  # used by the email regex below, missing from the original imports
from threading import Thread
import httplib, sys
import urllib2
import pprint
from tqdm import tqdm
import lxml.html
from Queue import Queue
from geoip import geolite2
import pycountry
from tld import get_tld
import json

resfile = open("out.txt",'a')

concurrent = 200

def doWork():
    while True:
        url = q.get()
        status = getStatus(url)
        doSomethingWithResult(status)
        q.task_done()

def getStatus(ourl):
    try:
        response = urllib2.urlopen("http://"+ourl)
        realsock = response.fp._sock.fp._sock
        peer = response.fp._sock.fp._sock.getpeername()
        ip = peer[0]
        header = response.info()
        html = response.read()
        realsock.close()
        response.close()
        html_element = lxml.html.fromstring(html)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        try:
            match = geolite2.lookup(ip)
            if match is not None:
                country = match.country
                try:
                    c = pycountry.countries.lookup(country)
                    country = c.name
                except:
                    country = ""
        except:
            country = ""
        try:
            res = get_tld("http://www"+ourl, as_object=True)
            tld = res.suffix
        except:
            tld = ""
        try:
            match = re.search(r'[\w\.-]+@[\w\.-]+', html)
            email = match.group(0)
        except:
            email = ""
        try:
            item = generator[0]
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\",\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":"+json.dumps(item)+",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        except:
            val = "{ \"Domain\":"+json.dumps("http://"+ourl.rstrip())+",\"IP\":\""+ip+"\"," + "\"Server\":"+json.dumps(str(header.getheader("Server")).replace("None",""))+",\"PoweredBy\":" +json.dumps(str(header.getheader("X-Powered-By")).replace("None",""))+",\"MetaGenerator\":\"\",\"Email\":"+json.dumps(email)+",\"Suffix\":\""+tld+"\",\"CountryHosted\":\""+country+"\" }"
        return val
    except Exception as e:
        print "error"+str(e)
        pass

def doSomethingWithResult(status):
    if status:
        resfile.write(str(status)+"\n")

q = Queue(concurrent * 2)
for i in range(concurrent):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()

try:
    for url in tqdm(open('list.txt')):
        q.put(url.strip())
        status = open("status.txt",'w')
        status.write(str(url.strip()))
    q.join()
except KeyboardInterrupt:
    sys.exit(1)
The handles will be automatically garbage collected, but you will be better off closing the handles yourself, especially as you are doing this in a tight loop.
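As a sketch of one way to make that explicit close automatic, contextlib.closing can wrap the response (fetch is just an illustrative helper; this only closes the response handle, not the underlying socket that your update also closes):

import contextlib
import urllib2

def fetch(ourl):
    # Ensure response.close() is called even if reading raises.
    with contextlib.closing(urllib2.urlopen("http://" + ourl)) as response:
        return response.read()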
You also asked for suggestions for improvement. A big one would be to stop using urllib2 and start using requests instead.
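A rough sketch of what the fetch-and-extract part of getStatus could look like with requests (an assumption about the intended behaviour rather than the author's code, with the field extraction trimmed to the essentials):

import json
import requests
import lxml.html

def get_status(ourl):
    """Fetch one domain and return a JSON record, or None on failure."""
    try:
        # The timeout keeps a dead host from hanging a worker thread.
        response = requests.get("http://" + ourl.strip(), timeout=10)
        html_element = lxml.html.fromstring(response.content)
        generator = html_element.xpath("//meta[@name='generator']/@content")
        record = {
            "Domain": "http://" + ourl.strip(),
            "Server": response.headers.get("Server", ""),
            "PoweredBy": response.headers.get("X-Powered-By", ""),
            "MetaGenerator": generator[0] if generator else "",
        }
        return json.dumps(record)
    except requests.RequestException:
        return None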
There are many possible reasons why your crawling rate drops.
1.) Take care not to crawl too much data from the same domain. Some web servers are configured to allow only one connection per IP address in parallel.
2.) Try sending randomized, browser-like HTTP headers (User-Agent, Referer, ...) to avoid triggering any scraping protection the web server may have (see the sketch after this list).
3.) Use a mature parallel HTTP library, like pycurl (which has MultiCurl) or requests (with grequests). They will almost certainly perform faster.
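As a sketch of point 2, assuming requests (the header values below are arbitrary examples, not a vetted list):

import random
import requests

# Arbitrary example User-Agent strings; a real crawler would use a larger,
# regularly updated pool.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
]

def fetch_with_random_headers(url):
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
    }
    return requests.get(url, headers=headers, timeout=10)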
I'm in the process of creating a program that takes an IP address, performs an nmap scan, and puts the output in a text file. The scan works fine, but I can't figure out why it isn't writing anything to the text file.
Here is what I have so far
if __name__ == "__main__":
    import socket
    import nmap
    import sys
    import io
    from libnmap.parser import NmapParser, NmapParserException
    from libnmap.process import NmapProcess
    from time import sleep
    from os import path

    #Program Banner
    if len(sys.argv) <= 1:
        print(
            """
            test
            """)
        sys.exit()

    #Grab IP Address as argument
    if len(sys.argv)==2:
        ip = sys.argv[1]
        print "\n[+] Reading IP Address"

    #Function - Pass IP to Nmap then start scanning
    print "\n[+] Passing " + ip + " to Nmap..."
    print("\n[+] Starting Nmap Scan\n")

    def nmap_scan(ip, options):
        parsed = None
        nmproc = NmapProcess(ip, options)
        rc = nmproc.run()
        if rc != 0:
            print("nmap scan failed: {0}".format(nmproc.stderr))
        try:
            parsed = NmapParser.parse(nmproc.stdout)
        except NmapParserException as e:
            print("Exception raised while parsing scan: {0}".format(e.msg))
        return parsed

    #Function - Display Nmap scan results
    def show_scan(nmap_report):
        for host in nmap_report.hosts:
            if len(host.hostnames):
                tmp_host = host.hostnames.pop()
            else:
                tmp_host = host.address
            print("Host is [ %s ]\n" % str.upper(host.status))
            print(" PORT STATE SERVICE")
            for serv in host.services:
                pserv = "{0:>5s}/{1:3s} {2:12s} {3}".format(
                    str(serv.port),
                    serv.protocol,
                    serv.state,
                    serv.service)
                if len(serv.banner):
                    pserv += " ({0})".format(serv.banner)
                print(pserv)

    #Function - Define output text file name & write to file
    def createFile(dest):
        name = "Enumerator-Results.txt"
        if not(path.isfile(dest+name)):
            f = open(dest+name,"a+")
            f.write(show_scan(report))
            f.close()

    if __name__ == "__main__":
        report = nmap_scan(ip, "-sV")
        if report:
            destination = "/root/Desktop/"
            createFile(destination)
            show_scan(report)
            print "\nReport Complete!"
        else:
            print("No results returned")
You're using print statements in your show_scan() function. Instead, try passing the file handle to show_scan() and replacing the print() calls with f.write() calls. That would save to the file everything you're currently printing to the terminal.
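A sketch of that first approach, assuming the same libnmap report object as in the question (the extra f parameter is the open file handle):

#Function - Display Nmap scan results, writing to an open file handle
def show_scan(nmap_report, f):
    for host in nmap_report.hosts:
        f.write("Host is [ %s ]\n" % str.upper(host.status))
        f.write(" PORT STATE SERVICE\n")
        for serv in host.services:
            pserv = "{0:>5s}/{1:3s} {2:12s} {3}".format(
                str(serv.port),
                serv.protocol,
                serv.state,
                serv.service)
            if len(serv.banner):
                pserv += " ({0})".format(serv.banner)
            f.write(pserv + "\n")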
Alternatively, you could just change your code so that the show_scan call is separate from the f.write().
i.e. change
f.write(show_scan(report))
to
f.write(str(report))
It depends on whether you want to save the raw output or what you're printing to the screen.
Also, you will need to pass the report to createFile so that it has something to write, i.e.
createFile(destination, report)
Just make sure you are always calling f.write() with a string as its parameter.
#Function - Define output text file name & write to file
def createFile(dest, report):
    name = "Enumerator-Results.txt"
    if not(path.isfile(dest+name)):
        f = open(dest+name,"a+")
        f.write(str(report))
        f.close()

if __name__ == "__main__":
    report = nmap_scan(ip, "-sV")
    if report:
        destination = "/root/Desktop/"
        createFile(destination, report)
        show_scan(report)
        print "\nReport Complete!"
    else:
        print("No results returned")
I'm writing a Python web crawler and I want to make it multi-threaded. I have now finished the basic part; below is what it does:
a thread gets a url from the queue;
the thread extracts the links from the page, checks whether the links exist in a pool (a set), and puts the new links into the queue and the pool;
the thread writes the url and the http response to a csv file.
But when I run the crawler, it always gets stuck eventually and doesn't exit properly. I have gone through the official Python documentation but still have no clue.
Below is the code:
#!/usr/bin/env python
#!coding=utf-8
import requests, re, urlparse
import threading
from Queue import Queue
from bs4 import BeautifulSoup

#custom modules and files
from setting import config

class Page:

    def __init__(self, url):
        self.url = url
        self.status = ""
        self.rawdata = ""
        self.error = False

        r = ""
        try:
            r = requests.get(self.url, headers={'User-Agent': 'random spider'})
        except requests.exceptions.RequestException as e:
            self.status = e
            self.error = True
        else:
            if not r.history:
                self.status = r.status_code
            else:
                self.status = r.history[0]
        self.rawdata = r

    def outlinks(self):
        self.outlinks = []

        #links, contains URL, anchor text, nofollow
        raw = self.rawdata.text.lower()
        soup = BeautifulSoup(raw)
        outlinks = soup.find_all('a', href=True)
        for link in outlinks:
            d = {"follow":"yes"}
            d['url'] = urlparse.urljoin(self.url, link.get('href'))
            d['anchortext'] = link.text
            if link.get('rel'):
                if "nofollow" in link.get('rel'):
                    d["follow"] = "no"
            if d not in self.outlinks:
                self.outlinks.append(d)

pool = Queue()
exist = set()
thread_num = 10
lock = threading.Lock()
output = open("final.csv", "a")

#the domain is the start point
domain = config["domain"]
pool.put(domain)
exist.add(domain)

def crawl():
    while True:
        p = Page(pool.get())

        #write data to output file
        lock.acquire()
        output.write(p.url+" "+str(p.status)+"\n")
        print "%s crawls %s" % (threading.currentThread().getName(), p.url)
        lock.release()

        if not p.error:
            p.outlinks()
            outlinks = p.outlinks
            if urlparse.urlparse(p.url)[1] == urlparse.urlparse(domain)[1]:
                for link in outlinks:
                    if link['url'] not in exist:
                        lock.acquire()
                        pool.put(link['url'])
                        exist.add(link['url'])
                        lock.release()

        pool.task_done()

for i in range(thread_num):
    t = threading.Thread(target = crawl)
    t.start()

pool.join()
output.close()
Any help would be appreciated!
Thanks
Marcus
Your crawl function has an infinite while loop with no possible exit path.
The condition True always evaluates to True and the loop continues, as you say,
not exiting properly
Modify the crawl function's while loop to include an exit condition: for instance, exit the loop once the number of links saved to the csv file exceeds a certain minimum.
i.e.,
def crawl():
    while len(exist) <= min_links:
        ...
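A minimal sketch of that idea on top of the question's code (min_links is a hypothetical limit you would choose; marking the workers as daemon threads is an extra assumption so that any worker still blocked in pool.get() cannot keep the process alive after pool.join() returns):

min_links = 1000  # hypothetical limit on how many pages to record

def crawl():
    while len(exist) <= min_links:
        p = Page(pool.get())
        # ... same body as in the original crawl() ...
        pool.task_done()

for i in range(thread_num):
    t = threading.Thread(target=crawl)
    t.daemon = True  # leftover workers won't block interpreter exit
    t.start()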