I'm on working on my tool.
So I have this function :
import subprocess, os, platform, ctypes, requests, random, threading
from bs4 import BeautifulSoup as bs
temptotal = 0
totalurl = 0
retry = 0
load = 0
load2 = 0
loaded = 0
dorksdone = 0
tempourl = 0
#Import Proxy List
selecting = 1
while selecting == 1:
try:
option = int(input("Choose Type Proxy(1 = http, 2=socks4, 3 = socks5) :")
except:
option = 404
if option == 1:
selecting = 0
prox = 'http'
proxyyyy = 'http'
elif option == 2:
selecting = 0
prox = 'socks4'
proxyyyy = 'socks4'
elif option == 3:
selecting = 0
prox = 'socks5'
proxyyyy = 'socks5'
else:
print("Choose valid numbre such as 1, 2 or 3!")
proxy_list = input("Give me Proxylist :" )
with open(proxy_list, mode="r", encoding="utf-8") as mf:
for line in mf:
load2 += 1
print(" ")
print("Total Proxy loaded :" + str(load2))
print(" ")
#import keywordfile
dorkslist = input("Give me KeywordList/Dorklist :" + bcolors.ENDC + " ")
with open(dorkslist, mode="r", encoding="utf-8") as mf:
for line in mf:
load += 1
mf.close()
print(" ")
print("Total Dorks loaded:" + str(load))
print(" ")
#define url to check
yahoourl = {"https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb",
"https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb&b=11&pz=10"}
#funtion i want to speed up
def checker():
global temptotal
global loaded
global dorksdone
global tempourl
proxy = set()
with open(proxy_list, "r") as f:
file_lines1 = f.readlines()
for line1 in file_lines1:
proxy.add(line1.strip())
with open(dorkslist, mode="r",encoding="utf-8") as my_file:
for line in my_file:
loaded += 1
threading.Thread(target=titre).start()
indorks = line
encode = requote_uri(indorks)
for yahoo in yahoourl:
yahooo = yahoo.replace("&fr",encode + "&fr")
try:
proxies = {
'http': prox+'://'+random.choice(list(proxy))
}
r = requests.get(yahooo, proxies=proxies)
print("Dorks used :" + indorks )
dorksdone += 1
soup = bs(r.text, 'html.parser')
links = soup.find_all('a')
for link in soup.find_all('a'):
a = link.get('href')
unquote(a)
temptotal += 1
with open("Bing.txt", mode="a",encoding="utf-8") as fullz:
fullz.write(a + "\n")
fullz.close()
lines_seen = set() # holds lines already seen
outfile = open("Bingnodup.txt", "w", encoding="utf-8")
for line in open("Bing.txt", "r", encoding="utf-8"):
if line not in lines_seen: # not a duplicate
outfile.write(line)
lines_seen.add(line)
outfile.close()
with open("Bingnodup.txt", mode="r", encoding="utf-8") as cool:
for url in cool:
try:
proxies = {
'http': prox+'://'+random.choice(list(proxy))
}
response = requests.get(url, proxies=proxies)
save = response.url
with open("Bingtemp.txt", mode="a", encoding="utf-8") as cool1:
cool1.write(save + "\n")
tempourl += 1
cool1.close()
except:
pass
except:
raise
fin()
#start bot
bot1 = threading.Thread(target=checker)
bot1.start()
bot1.join()
Exemple file for Keyword:
python
wordpress
Exemple file for proxy(http so take 1 on choice) :
46.4.96.137:8080
223.71.167.169:80
219.248.205.117:3128
198.24.171.34:8001
51.158.123.35:9999
But this function when running is very very very slow, could who let me know how i can give boost to this function ?
Because i have try to use this topic: How can I use threading in Python?
But i didn't understand how to build in into the right way for my function.
Your script is what's called I/O bound. What this means is that it is not slow because the CPU needs to perform long computations, but because it needs to wait a lot every time it requests a URL (the bottleneck are the requests to the internet).
For concurrency you have 3 options:
asyncio
threading
multiprocessing
The first two are the ones which can help you in I/O bound problems like yours. The first one is the recommended approach in a problem like this, since there is a library available with support for async/await.
This is an adapted example from the above link, which does exactly what you need:
import asyncio
import time
import aiohttp
def get_proxies():
if platform.system() == "Linux":
clear = lambda: os.system('clear')
clear()
if platform.system() == "Windows":
clear = lambda: os.system('cls')
clear()
proxy = set()
with open("proxy.txt", "r") as f:
file_lines1 = f.readlines()
for line1 in file_lines1:
proxy.add(line1.strip())
return proxy
async def download_site(session, url, proxies):
async with session.get(url, proxies=proxies) as response:
save = response.url
with open("Yahootemp.txt", mode="a", encoding="utf-8") as cool1:
cool1.write(save + "\n")
async def download_all_sites(sites, proxies):
async with aiohttp.ClientSession() as session:
tasks = []
for url in sites:
task = asyncio.ensure_future(download_site(session, url, proxies))
tasks.append(task)
await asyncio.gather(*tasks, return_exceptions=True)
if __name__ == "__main__":
proxies = get_proxies()
proxies = {
'http': prox + '://' + random.choice(list(proxies))
}
sites = []
with open("Yahoonodup.txt", mode="r", encoding="utf-8") as cool:
for url in cool:
sites.append(url)
asyncio.get_event_loop().run_until_complete(download_all_sites(sites, proxies))
You could make it even faster if saving the files seems to still be too slow; read this.
Related
I have written the following code and it works flawlessly but at one moment it starts typing out "programm ootab 3 minutit" after every 3 minutes. It means.. it skips all the print commands in between in the loops without throwing ANY errors. I have no idea how it can behave like that and it happens at random.. not during a specific loop or anything.
import praw
from time import sleep
import requests
from csv import DictReader
import os
#rotating proxy, 3 min, public proxies
os.environ["HTTPS_PROXY"] = "http://xxx.xxx.255.118:xxxxx"
os.environ["HTTP_PROXY"] = "http://xxx.xxx.255.118:xxxxx"
nr = 0
kommi_nr = 0
with open('file.csv', 'r') as read_obj:
csv_dict_reader = DictReader(read_obj)
for row in csv_dict_reader:
#connection with api
platform = praw.Platform(
client_id = row['client_id'],
client_secret = row['client_secret'],
user_agent = row['username'],
username = row['username'],
password = row['password'],
ratelimit_seconds = 930,
)
with open("posts.txt", 'a') as posted_comments:
with open("source.txt", encoding="utf8") as subs:
content = subs.readlines()
for line in content:
for post in platform.subplatform(content[nr].rstrip()).new(limit = 1):
if post.locked:
nr += 1
comment_done = False
else:
with open("spinned_comments.txt") as spinned_comments:
kindel_kommi_rida = spinned_comments.readlines()
post.reply(kindel_kommi_rida[kommi_nr])
print(row['username'] + " posted to " + post.permalink + " from IP: " + requests.get("https://icanhazip.com", timeout=1.5).text.strip())
print("KOMMENTAAR: " + kindel_kommi_rida[kommi_nr])
posted_comments.write(row['username'] + "\n")
kommi_nr +=1
nr += 1
comment_done = True
if comment_done:
break
print("programm ootab 3 minutit")
sleep(180)
I am using multi-threading to visit links read from a csv, strangely irrespective of the max-workers or even when I remove the multi-threading part, the code runs for an arbitrarily lower number of urls than in the list. I print the list to verify the count. For e.g if the list has 5000 urls, the code stops at 4084, if the links are 13,000 it will stop at 9200, even when it is just 130 links it will stop at 80 or something. What am I doing wrong here?
import requests
import xlrd
import concurrent.futures
from bs4 import BeautifulSoup
import csv
header_added = False
file_location = "Urls.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_index(0)
all_links = []
for row in range(1, 11000):
all_links.append(sheet.cell_value(row,0))
print(len(all_links))
i = 0
def get_solution(url):
global header_added, i
page = requests.get(url).text
soup = BeautifulSoup(page, 'html.parser')
ques_div = soup.find('p', class_='header-description')
ques = ques_div.find('span').text
ans_divs = soup.findAll('div', class_='puzzle-solution')
ans = ans_divs[0].text
print("Solution ", i)
i += 1
dict1 ={"Words": ques, "Solution": ans}
with open('Results10k.csv', 'a+', encoding='utf-8') as f:
w = csv.DictWriter(f, dict1.keys())
if not header_added:
w.writeheader()
header_added = True
w.writerow(dict1)
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
result = executor.map(get_solution, all_links)
Here's a reworking of your code that doesn't need locks – instead, there's only ever one process that writes to the file.
Also, due to the GIL, using a ThreadPool will be slower than a process-backed Pool.
import csv
import multiprocessing
import requests
import xlrd
from bs4 import BeautifulSoup
sess = requests.Session()
def get_solution(url):
try:
resp = sess.get(url)
resp.raise_for_status()
page = resp.text
soup = BeautifulSoup(page, "html.parser")
ques_div = soup.find("p", class_="header-description")
ques = ques_div.find("span").text.strip()
ans_divs = soup.findAll("div", class_="puzzle-solution")
ans = ans_divs[0].text.strip()
return {"URL": url, "Words": ques, "Solution": ans, "Error": ""}
except Exception as exc:
print(url, "Error:", exc)
return {"URL": url, "Words": "", "Solution": "", "Error": str(exc)}
def read_links(file_location):
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_index(0)
all_links = []
for row in range(1, 11000):
all_links.append(sheet.cell_value(row, 0))
return all_links
def main():
links = read_links("./Urls.xlsx")
with open("Results10k.csv", "w", encoding="utf-8") as f:
with multiprocessing.Pool() as p: # (or multiprocessing.pool.ThreadPool)
for i, result in enumerate(p.imap_unordered(get_solution, links, chunksize=16)):
if i == 0:
writer = csv.DictWriter(f, result.keys())
writer.writeheader()
writer.writerow(result)
f.flush() # Ensure changes are written immediately
if i % 100 == 0: # Progress indicator
print(i)
if __name__ == "__main__":
main()
It could be, that get_solution() crashes for some of the URLs. You could add a try/except in the body of the function and write all crashed URLS to a different file.
def get_solution(url):
try:
...
except:
with open('errors.txt','a+') as f:
f.write(url+'\n')
If this is the problem the numbers should add up to the total number.
Also open() is probably not thread safe.
file_lock = threading.Lock()
def get_solution(url):
with file_lock:
with open('Results10k.csv', 'a+', encoding='utf-8') as f:
w = csv.DictWriter(f, dict1.keys())
...
I'm parsing data from a text file ('placlog.txt') that is continuously being updated. As I run the code everything prints as expected, but if there are any updates to the placlog file while the code is running it is not printed.
The placlog file is being updated by a third-party program and I am using the above code to read the file and print any updates.
Once formatted, the text should be sent via a Telegram API. This part is also working initially.
import urllib.parse
import time
import requests
import os
def post_to_telegram(msg):
#print(msg)
base_url = 'https://api.telegram.org/bot&text="{}'.format(msg)
requests.get(base_url)
def check_url_inMsgList(stringToMatch, msgList):
for i in msgList:
if (stringToMatch in i):
return False
return True
try:
f = open("oldFile.txt", "r")
msgList = f.read().split("\n")
f.close()
except:
f = open("oldFile.txt", "w")
msgList = []
f.close()
selections = []
urr = ""
name = ""
pie = ""
ourLines = 2400
url_found = 0
name_found = 0
pie_found = 0
while (True):
file1 = open('placlog.txt', 'r')
Lines = file1.readlines()
file1.close()
while (True):
# print("-------------------------------")
if (ourLines == len(Lines)):
break
elif (ourLines > len(Lines)):
ourLines = 0
else:
txt = Lines[ourLines].strip()
tlist = txt.split("&")
ourLines = ourLines + 1
for subtxt in tlist:
if "eventurl=" in subtxt:
a = subtxt[9:len(subtxt) - 3]
url = "www.awebsite.com/%23" + a.replace("%23", "/")
#url = url.replace("%23", "#")
for i in range(10):
if "F" + str(i) + "/" in url:
url = url.split("F" + str(i) + "/")[0] + "F" + str(i) + "/"
urr = url
url_found = 1
elif "bit=" in subtxt:
name = urllib.parse.unquote(subtxt[4:len(subtxt)])
name_found = 1
elif "pie\":" in subtxt:
a = subtxt.split("price")[1]
pie = a.split("\"")[2]
pie = float(pie)
pie = round(pie, 1)
pie = str(pie)
pie_found = 1
selections.append(url + name + pie)
msg = (url + " " + name + " " + pie)
stringToFind = url + " " + name
if (check_url_inMsgList(stringToFind, msgList)):
post_to_telegram(msg)
msgList.append(msg)
print(msg)
f = open("oldFile.txt", "a+")
f.write(msg + "\n")
f.close()
time.sleep(0.5)
elif "minodds=" in subtxt:
a = subtxt.split("minodds=")[1]
pie = a.split("&")[0]
pie = float(pie)
rie = round(pie, 1)
pie = str(pie)
pie_found = 1
selections.append(url + name + pie)
msg = (url + " " + name + " " + pie)
stringToFind = url + " " + name
if (check_url_inMsgList(stringToFind, msgList)):
post_to_telegram(msg)
msgList.append(msg)
print(msg)
f = open("oldFile.txt", "a+")
f.write(msg + "\n")
f.close()
time.sleep(0.5)
time.sleep(1)
I would recommend using watchdog, and seeing if that helps your situation. It can monitor for file system changes, so you could define a function which is executed when the placlog.txt file is changed/updated.
A good guide can be found here: http://thepythoncorner.com/dev/how-to-create-a-watchdog-in-python-to-look-for-filesystem-changes/
From that guide, you can simply change the functions defined to suit your needs i.e.
def on_modified(event):
if event.src_path == "path/to/placlog.txt":
with open('placlog.txt', 'r') as placlog:
lines = file1.readlines()
Could you try this out and see if it helps? I still recommend the with statement for file i/o since you always want your file to close no matter what.
This link might also be useful since they are also monitoring a single .txt file: Python Watchdog - src_path inconsistent
watchdog documentation: https://pythonhosted.org/watchdog/
Note: Deleted the old answer since you clarified the question.
I have written the following to automate the parsing of text to a telegram bot from a .txt file that is continuously being updated.
import urllib.parse
import time
import requests
def post_to_telegram(msg):
print(msg)
base_url = 'https://api.telegram.org/bot&text="{}"'.format(msg)
requests.get(base_url)
urr = ""
name = ""
price = ""
ourLines=0
while(True):
file1 = open('example.txt', 'r')
Lines = file1.readlines()
time.sleep(1)
while(True):
if(ourLines==len(Lines)):
break
else:
txt = Lines[ourLines].strip()
tlist = txt.split("&")
ourLines=ourLines+1
for subtxt in tlist:
if "eventurl=" in subtxt:
a = subtxt[9:len(subtxt) - 3]
url = 'www.bbey43.com/#'+a.replace("%23", "/")
#print(url)
urr = url
elif "bet=" in subtxt:
name = urllib.parse.unquote(subtxt[4:len(subtxt)])
#print(name)
elif "price\":" in subtxt:
a = subtxt.split("price")[1]
price = a.split("\"")[2]
#print(price)
post_to_telegram(urr + " "+ name + " " + price)
the 'name' & 'price' is successfully posted to the bot, but the 'url' doesn't post correctly. The only thing that gets through is "bbey43.com/#/
The solution to this was rather simple in the end. As the "#" was apart of a URL it required special formatting when being parsed.
Simply adding %23 instead of # solved it.
The program is intended to display a map with pins showing the locations of institutions that make use of one of our facilities. The program takes a csv file, reads the postcodes, geocodes them and places the pins on the map. The size of the pins is relative to the number of times they have used the facility.
However, when the csv file is uploaded the program generates a map with all the pins over Nigeria. Looking through the output from the program, it seems to be geocoding correctly so I am not sure what is going on. The program geocodes using an offline database as python's urllib is not compatible with the proxy setup at my office.
The program is split into two separate modules, the map generation module and the geocoding module.
Here is the map generation part:
import folium
from bottle import route, run, template, static_file, request
import urllib.request
import urllib.parse
import json
import os
os.system('start geocoder.bat')
institutionList = []
map_osm = folium.Map(location=[55, -2], zoom_start=5)
#route('/spreadsheet.html')
def send_static():
return static_file('spreadsheet.html',root='')
#route('/upload', method='POST')
def do_upload():
category = request.forms.get('category')
upload = request.files.get('upload')
name, ext = os.path.splitext(upload.filename)
if ext not in ('.csv'):
return 'File extension not allowed.'
upload.save('uploads/' + upload.filename)
fileList = []
with open('spreadsheetList','r') as f:
while True:
line = f.readline()
if not line: break
print(line.strip())
print("line should have just printed")
fileList.append(line.strip())
f.close()
lengthFileList = len(fileList)
x = 0
while x < lengthFileList:
with open(('uploads/' + fileList[x]),'r') as spread:
while True:
line = spread.readline()
if not line: break
institutionDetails = line.split(',')
institutionList.append(institutionDetails)
spread.close()
x = x + 1
spreadsheetName = upload.filename
f = open('spreadsheetList','a')
f.write(spreadsheetName + '\n')
f.close()
with open('uploads/' + spreadsheetName, 'r') as f:
while True:
line = f.readline()
if not line: break
institutionDetails = line.split(',')
institutionList.append(institutionDetails)
print(institutionList)
f.close()
lengthOfList = len(institutionList)
x = 0
coords = []
while x < lengthOfList:
address = urllib.parse.quote_plus(institutionList[x][1])
response = urllib.request.urlopen('http://localhost:80/geoCodeRequest/' + address).read().decode('utf-8')
cleanResponse = str(response).replace('"','')
coords = cleanResponse
print(cleanResponse)
institutionList[x].append(coords)
x = x + 1
print("http sources successfully accessed")
print(institutionList)
x = 0
while x < lengthOfList:
try:
map_osm.circle_marker(location=institutionList[x][3], radius=(int(institutionList[x][2]) * 10),popup=institutionList[x][0], line_color='#3186cc',fill_color='#3186cc', fill_opacity=0.2)
print("marker added")
except:
print("marker could not be added")
x = x + 1
map_osm.create_map(path='osm.html')
return '<meta http-equiv="refresh" content="0; url=osm.html">'
#route('/osm.html')
def send_static():
return static_file('osm.html',root='')
run(host='localhost', port=8080)
A batch file is used to start the second module:
#echo off
python geocodeProxyBypass.py
Here is the second module of code, the geocoding module:
from bottle import route, run, template
import string
location = []
x = 0
#route('/geoCodeRequest/<name>')
def redir(name):
x = 0
print(name)
print(name[:4])
print(name[:3])
with open('ukPostcode.csv','r') as f:
while True:
line = f.readline()
print(line)
if not line: break
locationDetails = line.split(',')
location.append(locationDetails)
print(location[x][0])
if location[x][0] == ('"' + name[:4] + '"'):
coords = location[x][3] + ", " + location[x][4]
return coords
elif location[x][0] == ('"' + name[:3] + '"'):
coords = location[x][3] + ", " + location[x][4]
return ((coords.replace('"','')))
else:
print("no match found for " + name)
x = x + 1
f.close()
run(host='localhost', port=80)
Here is what an example pin generated by the program should look like:
var circle_1 = L.circle([51.74, -1.25
], 7460, {
color: '#3186cc',
fillColor: '#3186cc',
fillOpacity: 0.2
});
circle_1.bindPopup("University of Oxford");
circle_1._popup.options.maxWidth = 300;
map.addLayer(circle_1)
Here is what is actually being output:
var circle_1 = L.circle([5, 1
], 7460, {
color: '#3186cc',
fillColor: '#3186cc',
fillOpacity: 0.2
});
circle_1.bindPopup("University of Oxford");
circle_1._popup.options.maxWidth = 300;
map.addLayer(circle_1)
Apologies for the very long question, please help!