Remove duplicate lines from threading - python

I have a program that reads lines randomly from a file, and uses threading. The problem is that whenever it reads the lines from a file, it sometimes reads a duplicate line from the file. For instance, let's say I use 5 threads and my file looks like this:
line1
line2
line3
line4
line5
The program uses threading to read the lines randomly, but sometimes it can read line4, line3, line5, line2, line5 (again). So my question is how would I get rid of the line5 being a duplicate?
Code:
def get_token():
tokens = []
with open('pokens.txt', 'r', encoding='UTF-8') as file:
lines = file.readlines()
for line in lines:
tokens.append(line.replace('\n', ''))
return tokens
def get_proxy():
proxies = []
with open('proxies.txt', 'r', encoding='UTF-8') as file:
lines = file.readlines()
for line in lines:
proxies.append(line.replace('\n', ''))
return proxies
class Gen:
def __init__(self, token, proxy=None):
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])
proxy_ip_port = proxy
proxy2 = Proxy()
proxy2.proxy_type = ProxyType.MANUAL
proxy2.http_proxy = proxy_ip_port
proxy2.ssl_proxy = proxy_ip_port
capabilities = webdriver.DesiredCapabilities.CHROME
proxy2.add_to_capabilities(capabilities)
self.browser = webdriver.Chrome("chromedriver.exe")
self.token = token
self.proxy = proxy
self.password = 'passwordhere'
def register(self):
print('hi')
# Code continues with no duplicates
def worker(proxy=None):
token_list = get_token()
token = random.choice(token_list)
d = Gen(token, proxy=proxy)
d.register()
def main():
threads = []
num_thread = input('Number of Threads: ')
num_thread = int(num_thread)
proxies = get_proxy()
for i in range(num_thread):
t = threading.Thread(target=worker, args= (random.choice(proxies), ))
threads.append(t)
t.start()
if __name__ == '__main__':
main()

Below is a simplified "toy version" of your program that I updated to do the following:
Read the tokens-file from the main thread, into a list
Randomly shuffle the order of the list
Give each worker a roughly-equally-sized subset of the tokens-list for it to choose from
Each worker merely prints out the data that it was given by the main thread (actually doing anything with the data is omitted, for clarity)
This approach avoid duplicates because any given token appears in the list only once, and each thread has been given a different subset of the list to choose tokens from.
import threading
import random
def read_tokens_list():
tokens = []
with open('pokens.txt', 'r', encoding='UTF-8') as file:
lines = file.readlines()
for line in lines:
tokens.append(line.replace('\n', ''))
return tokens
def read_proxies_list():
proxies = []
with open('proxies.txt', 'r', encoding='UTF-8') as file:
lines = file.readlines()
for line in lines:
proxies.append(line.replace('\n', ''))
return proxies
def worker(proxy,token_list):
token = random.choice(token_list)
print("Worker: my proxy is [%s], my token list is %s, I've chosen [%s] as my token" % (proxy, token_list, token))
def main():
threads = []
num_thread = input('Number of Threads: ')
num_thread = int(num_thread)
proxies = read_proxies_list()
token_list = read_tokens_list() # read in the pokens.txt file
random.shuffle(token_list) # shuffle the list into random order
tokens_per_worker = len(token_list) // num_thread # how many tokens from the list each worker will get (roughly)
for i in range(num_thread):
if ((i+1)<num_thread):
num_tokens_for_this_worker = tokens_per_worker # give each worker an even share of the list
else:
num_tokens_for_this_worker = len(token_list) # except the last worker gets whatever is left
# we'll give the first (num_tokens_for_this_worker) tokens in the list to this worker
tokens_for_this_worker = token_list[0:num_tokens_for_this_worker]
# and remove those tokens from the list so that they won't get used by anyone else
token_list = token_list[num_tokens_for_this_worker:]
t = threading.Thread(target=worker, args=(random.choice(proxies), tokens_for_this_worker, ))
threads.append(t)
t.start()
for t in threads:
t.join()
if __name__ == '__main__':
main()

Related

Python pass arguments from a .csv file to threads

the csv:
email,password
test0#test.com,SuperSecretpassword123!
test1#test.com,SuperSecretpassword123!
test2#test.com,SuperSecretpassword123!
test3#test.com,SuperSecretpassword123!
the example printing function
def start_printer(row):
email = row["email"]
password = row["password"]
print(f"{email}:{password}")
the threading starting example
number_of_threads = 10
for _ in range(number_of_threads):
t = Thread(target=start_printer, args=(row,))
time.sleep(0.1)
t.start()
threads.append(t)
for t in threads:
t.join()
how do I pass the values from the csv to the threading example?
I guess you could go about it this way:
from threading import Thread
from csv import DictReader
def returnPartionedList(inputlist: list, x: int = 100) -> list: # returns inputlist split into x parts, default is 100
return([inputlist[i:i + x] for i in range(0, len(inputlist), x)])
def start_printer(row) -> None:
email: str = row["email"]
password: str = row["password"]
print(f"{email}:{password}")
def main() -> None:
path: str = r"tasks.csv"
list_tasks: list = []
with open(path) as csv_file:
csv_reader: DictReader = DictReader(csv_file, delimiter=',')
for row in csv_reader:
list_tasks.append(row)
list_tasks_partitions: list = returnPartionedList(list_tasks, 10) # Run 10 threads per partition
for partition in list_tasks_partitions:
threads: list = [Thread(target=start_printer, args=(row,)) for row in partition]
for t in threads:
t.start()
t.join()
if __name__ == "__main__":
main()
Result:
test0#test.com:SuperSecretpassword123!
test1#test.com:SuperSecretpassword123!
test2#test.com:SuperSecretpassword123!
test3#test.com:SuperSecretpassword123!

write data to JSON file during multiprocessing using python

I am new to python. I am writing a python program to write to a JSON file if the website is unreachable. The multiple websites will be stored in hosts variable. It will be scheduled to check every 5 seconds. I have used pool from multiprocessing to process the website at the same time without delay. After that, i will write the data to the json file. But in here, it is writing only one website data to json file. So how to make this to write two data at the same time.
Here's the sample code:
import os
from multiprocessing import Pool
from datetime import datetime
import time
import json
hosts = ["www.google.com","www.smackcoders.com"]
n = len(hosts)
def write(hosts):
u = "down"
name = "stack.json"
if not os.path.exists(name):
with open(name, 'w') as f:
f.write('{}')
result = [(timestamp, {'monitor.status': u,
"monitor.id": "tcp-tcp#"+hosts
})]
with open(name, 'rb+') as f:
f.seek(-1, os.SEEK_END)
f.truncate()
for entry in result:
_entry = '"{}":{},\n'.format(entry[0], json.dumps(entry[1]))
_entry = _entry.encode()
f.write(_entry)
f.write('}'.encode('ascii'))
def main(hosts):
p = Pool(processes= n)
result = p.map(write, hosts)
while True:
timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
main(hosts)
time.sleep(5)
My output:
""March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.smackcoders.com", "monitor.status": "down"},
}
Required Output:
{"March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.smackcoders.com", "monitor.status": "down"},"March 13 2019, 10:49:03":{"monitor.id": "tcp-tcp#www.google.com", "monitor.status": "down"},
}
Ive made some minor changes to your code and implemented a Lock.
import os
from multiprocessing import Pool,RLock
from datetime import datetime
import time
import json
file_lock=RLock()
hosts = ["www.google.com","www.smackcoders.com"]
n = len(hosts)
def write(hosts):
u = "down"
name = "stack.json"
if not os.path.exists(name):
with open(name, 'w') as f:
f.write('{}')
result = [(timestamp, {'monitor.status': u,
"monitor.id": "tcp-tcp#"+hosts
})]
with file_lock:
with open(name, 'rb+') as f:
f.seek(-1, os.SEEK_END)
f.truncate()
for entry in result:
_entry = '"{}":{},\n'.format(entry[0], json.dumps(entry[1]))
_entry = _entry.encode()
f.write(_entry)
f.write('}'.encode('ascii'))
def main(hosts):
p = Pool(processes= n)
result = p.map(write, hosts)
while True:
timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
main(hosts)
time.sleep(5)
However, for a long running process that constantly has to read and write a file for logging seems like a poor implementation as the code will have to read a bulky file and completely rewrite it on every process. Consider writing the log in a database instead.
Here's a different option that will use Thread over Pool.
Created a class to get the return of join()
# Class that overwrite Thread to get the return of join()
class ThreadWithReturnValue(Thread):
def __init__(self, group=None, target=None, name=None, args=None, kwargs=None, Verbose=None):
if args is None:
args = ()
if kwargs is None:
kwargs = {}
super().__init__(group, target, name, args, kwargs)
self._return = None
def run(self):
print(type(self._target))
if self._target is not None:
self._return = self._target(*self._args, **self._kwargs)
def join(self, *args):
Thread.join(self, *args)
return self._return
I have changed the code to get the status of each hosts first, then writing the result to your file. Also fixed the way the JSON file is written.
import os
from datetime import datetime
import time
import json
from threading import Thread
hosts = ["www.google.com","www.smackcoders.com"]
filepath = os.path.join(os.getcwd(), "stack.json")
n = len(hosts)
def perform_ping(host_ip):
"""
You have hardcoded down, this method will ping to check if we get an ICMP response
"""
response = os.system("ping -c 1 " + host_ip)
if response == 0:
return 'UP'
else:
return 'DOWN'
def write_result(timestamp, results):
# u = "down" Using perform_ping to get the status
if not os.path.exists(filepath):
current_file = {}
else:
# If file exist, reading the current output
with open(filepath, 'r') as f_read:
current_file = json.loads(f_read.read())
inner_result = []
for result in results:
host, status = result
inner_result.append({'monitor.status': status,
"monitor.id": "tcp-tcp#"+host
})
current_file[timestamp] = inner_result
# writing the file with new input
with open(filepath, 'w') as f_write:
f_write.write(json.dumps(current_file))
def main():
while True:
thread_list = []
for host_ip in hosts:
thread_list.append(ThreadWithReturnValue(target=perform_ping, name=host_ip, args=(host_ip, )))
results = []
timestamp = datetime.now().strftime("%B %d %Y, %H:%M:%S")
for thread in thread_list:
thread.start()
for thread in thread_list:
results.append((thread.name, thread.join()))
# Ping is done in parallel, writing the result at the end to avoid thread collision and reading/writing the file to many times if you increase the number of host
write_result(timestamp, results)
time.sleep(5)
if __name__ == '__main__':
main()

python for loop in parallel

I am trying to read data from an input file, and for each line perform a task in a while loop. Problem is that when I create the first process - its loop is executing and not returning control to the above for loop. Bottom line there is no parallelism. What am I doing wrong?
Here is the relevant code:
from multiprocessing import Process
def work_line(list1Line,jobId):
while True:
print list1Line
tenant = list1Line[0]
module = list1Line[1]
endTime = int(time.time())
startTime = endTime - startTimeDelta
generate(jobId, startTime, endTime, tenantServiceAddress, tenant, module)
print ("tenant {} will sleep for {} seconds").format(tenant,sleepBetweenLoops)
time.sleep(sleepBetweenLoops)
def openFiles():
file = open(CLOUD_INPUT_FILE, 'r')
lines = file.readlines()
file.close()
linesLen = len(lines)
processes = []
for linesIndex in range(0, linesLen):
jobId = GenerateRandomID()
line = lines[linesIndex]
list1Line = line.split()
p = Process(target=work_line(list1Line,jobId))
p.start()
processes.append(p)
print processes
for p in processes:
p.join()
if __name__ == '__main__':
CLOUD_INPUT_FILE = r'C:\CF\input_file.txt'
tenantServiceAddress = 'address.address'
startTimeDelta = 300
sleepBetweenLoops = 1800
print multiprocessing.cpu_count()
openFiles()
You are actually calling the function. Change to
p = Process(target=work_line, args=(list1Line,jobId))

Strange output for threaded subprocesses

thanks for taking the time to look at this, any help with this would be most appreciated!
I am trying to get some network stats by pinging a list of IP address' called ips. The problem i'm having however is that my output is a list containing several 'None's'. Before implementing threads to run the sub-process commands, the output shown below was a series of numbers. If anybody can take a look at my source code and shed some light on the issue i'd be very grateful!
Thank you in advance!
import subprocess
import re
import threading
from multiprocessing import Pool, Lock
from multiprocessing.dummy import Pool as ThreadPool
def get_ips():
# Fill empty list with IP address
ips = []
with open('C:\Python26\ARPips.prn','r')as f:
for line in f:
line = line[:-1]
if line != "end":
ips.append(line)
return ips
def ping(pingArgs):
lock = Lock()
lock.acquire()
# Ping with "pingArgs" as the arguments
ping = subprocess.Popen(pingArgs,
stdout = subprocess.PIPE,
stderr = subprocess.PIPE,
shell=True)
# Get and parse output
out = ping.communicate()
out = ''.join((out))
lost = re.findall(r"Lost = (\d+)", out)
minimum = re.findall(r"Minimum = (\d+)", out)
maximum = re.findall(r"Maximum = (\d+)", out)
avg = re.findall(r"Average = (\d+)", out)
no = re.findall(r"Sent = (\d+)", out)
# Change output to integers
lost = [int(x) for x in lost]
minimum = [int(x) for x in minimum]
maximum = [int(x) for x in maximum]
avg = [int(x) for x in avg]
no = [int(x) for x in no]
print "%s \t \t %s \t \t%s \t \t %s \t \t%s" % (no, lost, maximum, minimum, avg)
lock.release()
def main():
# grab IP address list
ips = get_ips()
# Declare global variables
global position, newIP, pingArgs
position = 0
newIP = ips[position]
position += 1
pingArgs = ["ping", "-n", "1", "-l", "1", "-w", "100", newIP]
# Header for output
print "Packets \t loss(%) \t Max(ms) \t Min(ms) \t Average(ms)"
# Instantiate Pool objects, and set size of pool
pool = Pool(processes=12)
#Ping ips in own threads and return the results
result = pool.map(ping, ips)
# Close the pool and wait for work to finish
pool.close()
pool.join()
# print the results
print result
if __name__ == '__main__':
main()
print get_ips()
The output is shown here below:
Packets loss(%) Max(ms) Min(ms) Average(ms)
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[] [] [] [] []
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
['10.10.10.1', '10.10.10.41', '10.10.10.42', '10.10.10.43', '10.10.10.49', '10.10.10.51', '10.10.10.61', '10.10.10.71', '10.10.10.91', '10.10.10.92', '10.10.10.201', '10.10.10.205', '10.10.10.208', '10.10.10.209', '10.10.10.213', '10.10.10.214']
Process finished with exit code 0
The problem i'm having however is that my output is a list containing several 'None's'
The Nones are produced by print result where result = pool.map(ping, ips) because ping() function doesn't return anything (it means it returns None in Python).
Before implementing threads to run the sub-process commands, the output shown below was a series of numbers.
You are passing ip addresses to Popen() instead of the full ping command to run.
You are defining global pingArgs that is not used anywhere (local pingArgs in ping() overshadows it). You could replace pool.map by builtin map call that creates the result list in the same process and in the same thread, to see that the result is the same (regexes do not match) and the error is not related to threads/processes.
Local lock = Lock() is useless.
Here's the code where the above issues are fixed:
#!/usr/bin/env python
import re
from multiprocessing.dummy import Pool # use threads
from subprocess import Popen, PIPE
def get_ips(filename):
ips = []
with open(filename) as f:
for line in f:
line = line.strip()
if line and line != "end":
ips.append(line)
return ips
def ping(ip):
cmd = ["ping", "-n", "1", "-l", "1", "-w", "100", ip]
ping = Popen(cmd, stdout=PIPE, stderr=PIPE)
# Get and parse output
output, err = ping.communicate()
out = ''.join([output, err])
lost = re.findall(r"Lost = (\d+)", out)
minimum = re.findall(r"Minimum = (\d+)", out)
maximum = re.findall(r"Maximum = (\d+)", out)
avg = re.findall(r"Average = (\d+)", out)
no = re.findall(r"Sent = (\d+)", out)
# Change output to integers
lost = [int(x) for x in lost]
minimum = [int(x) for x in minimum]
maximum = [int(x) for x in maximum]
avg = [int(x) for x in avg]
no = [int(x) for x in no]
return "%s \t \t %s \t \t%s \t \t %s \t \t%s" % (
no, lost, maximum, minimum, avg)
def main():
# grab IP address list
ips = get_ips(r'C:\Python26\ARPips.prn')
# Header for output
print "Packets \t loss(%) \t Max(ms) \t Min(ms) \t Average(ms)"
# Instantiate Pool objects, and set size of pool
pool = Pool(processes=12)
#Ping ips in own threads and return the results
results = pool.map(ping, ips)
# Close the pool and wait for work to finish
pool.close()
pool.join()
# print the results
print "\n".join(results)
if __name__ == '__main__':
main()
This is what got it working for me (unix) system
import subprocess
import re
from multiprocessing import Pool, Lock
lock = Lock()
pingArgs = "ping -n -l 1 -w 100 "
def get_ips():
# Fill empty list with IP address
ips = []
with open('C:\\Python26\\ARPips.prn', 'r')as f:
for line in f:
line = line[:-1]
if line != "end":
ips.append(line)
return ips
def ping(ip):
global lock
pingCommand = pingArgs + ip
lock.acquire()
# Ping with "pingArgs" as the arguments
ping = subprocess.Popen(pingCommand,
stdout = subprocess.PIPE,
stderr = subprocess.PIPE,
shell=True)
# Get and parse output
out = ping.communicate()
out = ''.join((out))
lost = re.findall(r"Lost = (\d+)", out)
minimum = re.findall(r"Minimum = (\d+)", out)
maximum = re.findall(r"Maximum = (\d+)", out)
avg = re.findall(r"Average = (\d+)", out)
no = re.findall(r"Sent = (\d+)", out)
# Change output to integers
lost = [int(x) for x in lost]
minimum = [int(x) for x in minimum]
maximum = [int(x) for x in maximum]
avg = [int(x) for x in avg]
no = [int(x) for x in no]
lock.release()
return "%s \t \t %s \t \t%s \t \t %s \t \t%s" % (no, lost, maximum, minimum, avg)
def main():
# grab IP address list
ips = get_ips()
# Header for output
print("Packets \t loss(%) \t Max(ms) \t Min(ms) \t Average(ms)")
# Instantiate Pool objects, and set size of pool
pool = Pool(processes=12)
#Ping ips in own threads and return the results
results = pool.map(ping, ips)
# Close the pool and wait for work to finish
pool.close()
pool.join()
# print the results
for result in results:
print(result)
if __name__ == '__main__':
main()
print(get_ips())
The issues I was experiencing with the code were incorrect ping arguments (-n didn't take any additional parameters), lock should be global, when shell=True it didn't take a list, only a string, and ping(pingArgs) could overwrite the global argument, whereas you want the pool to send only the ip to the worker and then the worker should add it as the final argument.
also I noticed you didn't escape your path to the file on dist, I didn't use/test that part but included it escaped properly as well for reference.
Edit: Also changed it so each function would simply return the value to be printed and print all of them when completed. Instead of having them being printed inside the function as well as the results of the functions without returns (the list of Nones).

Using python how to execute this code concurrently for multiple files?

I want to tail multiple files concurrently and push the logs to scribe.
I am reading the files from a Config file then I want to tail each file and send the logs to scribe.
What I have tried is sends log for only the first file and doesn't for the others.
I want to run the tailing concurrently for each file and send the logs for each one of them at same time.
for l in Config.items('files'):
print l[0]
print l[1]
filename = l[1]
file = open(filename,'r')
st_results = os.stat(l[1])
st_size = st_results[6]
file.seek(st_size)
while 1:
where = file.tell()
line = file.readline()
if not line:
time.sleep(1)
file.seek(where)
else:
print line, # already has newline
category=l[0]
message=line
log_entry = scribe.LogEntry(category, message)
socket = TSocket.TSocket(host='localhost', port=1463)
transport = TTransport.TFramedTransport(socket)
protocol = TBinaryProtocol.TBinaryProtocol(trans=transport, strictRead=False, strictWrite=False)
client = scribe.Client(iprot=protocol, oprot=protocol)
transport.open()
result = client.Log(messages=[log_entry])
transport.close()
Try something like this (Inspired by this)
import threading
def monitor_file(l):
print l[0]
print l[1]
filename = l[1]
file = open(filename,'r')
st_results = os.stat(l[1])
st_size = st_results[6]
file.seek(st_size)
while 1:
where = file.tell()
line = file.readline()
if not line:
time.sleep(1)
file.seek(where)
else:
print line, # already has newline
category=l[0]
message=line
log_entry = scribe.LogEntry(category, message)
socket = TSocket.TSocket(host='localhost', port=1463)
transport = TTransport.TFramedTransport(socket)
protocol = TBinaryProtocol.TBinaryProtocol(trans=transport, strictRead=False, strictWrite=False)
client = scribe.Client(iprot=protocol, oprot=protocol)
transport.open()
result = client.Log(messages=[log_entry])
transport.close()
for l in Config.items('files'):
thread = threading.Thread(target=monitor_file, args=(l))
A different implementation of #Pengman's idea:
#!/usr/bin/env python
import os
import time
from threading import Thread
def follow(filename):
with open(filename) as file:
file.seek(0, os.SEEK_END) # goto EOF
while True:
for line in iter(file.readline, ''):
yield line
time.sleep(1)
def logtail(category, filename):
print category
print filename
for line in follow(filename):
print line,
log_entry(category, line)
for args in Config.items('files'):
Thread(target=logtail, args=args).start()
Where log_entry() is a copy of the code from the question:
def log_entry(category, message):
entry = scribe.LogEntry(category, message)
socket = TSocket.TSocket(host='localhost', port=1463)
transport = TTransport.TFramedTransport(socket)
protocol = TBinaryProtocol.TBinaryProtocol(trans=transport,strictRead=False,
strictWrite=False)
client = scribe.Client(iprot=protocol, oprot=protocol)
transport.open()
result = client.Log(messages=[entry])
transport.close()
follow() could be implemented using FS monitoring tools, see tail -f in python with no time.sleep.

Categories

Resources