Run multiple terminals from a Python script and execute commands (Ubuntu)

I have a text file containing all the items that need to be deleted from an online app. Each item has to be sent for deletion one at a time. To make the deletion process faster, I split the items in the text file across multiple text files and run the script in multiple terminals (~130 of them, so that deleting ~7000 items takes under 30 minutes).
This is the code of the deletion script:
from fileinput import filename
from WitApiClient import WitApiClient
import os

dirname = os.path.dirname(__file__)
parent_dirname = os.path.dirname(dirname)
token = input("Enter the token")
file_name = os.path.join(parent_dirname, 'data/deletion_pair.txt')

with open(file_name, encoding="utf-8") as file:
    templates = [line.strip() for line in file.readlines()]

for template in templates:
    entity, keyword = template.split(", ")
    print(entity, keyword)
    resp = WitApiClient(token).delete_keyword(entity, keyword)
    print(resp)
So I divide the items in deletion_pair.txt and run this script multiple times in new terminals (~130 of them). Is there a way to automate this process or do it in a more efficient manner?

I used threading to run multiple functions simultaneously:
from fileinput import filename
from WitApiClient import WitApiClient
import os
from threading import Thread

dirname = os.path.dirname(__file__)
parent_dirname = os.path.dirname(dirname)
token = input("Enter the token")
file_name = os.path.join(parent_dirname, 'data/deletion_pair.txt')

with open(file_name, encoding="utf-8") as file:
    templates = [line.strip() for line in file.readlines()]

batch_size = 20
chunks = [templates[i: i + batch_size] for i in range(0, len(templates), batch_size)]

def delete_function(templates, token):
    for template in templates:
        entity, keyword = template.split(", ")
        print(entity, keyword)
        resp = WitApiClient(token).delete_keyword(entity, keyword)
        print(resp)

for chunk in chunks:
    thread = Thread(target=delete_function, args=(chunk, token))
    thread.start()
It worked! If anyone has another solution, please post it, or if the same code can be written more efficiently, please do tell. Thanks.
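For comparison, here is a minimal sketch of the same approach using concurrent.futures.ThreadPoolExecutor, which caps the number of concurrent requests and waits for all deletions to finish before the script exits. It assumes the same WitApiClient and deletion_pair.txt layout as above:

import os
from concurrent.futures import ThreadPoolExecutor

from WitApiClient import WitApiClient

dirname = os.path.dirname(__file__)
parent_dirname = os.path.dirname(dirname)
token = input("Enter the token")
file_name = os.path.join(parent_dirname, 'data/deletion_pair.txt')

with open(file_name, encoding="utf-8") as file:
    templates = [line.strip() for line in file]

def delete_pair(template):
    # Each line is expected to look like "entity, keyword"
    entity, keyword = template.split(", ")
    return WitApiClient(token).delete_keyword(entity, keyword)

# The pool limits how many requests run at once, and the with-block
# blocks until every submitted deletion has completed.
with ThreadPoolExecutor(max_workers=20) as executor:
    for resp in executor.map(delete_pair, templates):
        print(resp)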

Related

Is there a way I can use multi-threading or multi-processing in python to connect to 200 different servers and download data from them

I am writing a script in Python to download data from around 200 different servers using multi-threading. My objective is to fetch data from a table in each server's database and save the data into a CSV file. All the servers have the same database and table.
The code I have written is:
import concurrent.futures
import sqlalchemy as db
import urllib
import pandas as pd

def write_to_database(line):
    try:
        server = line[0]
        filename = line[1]
        file_format = ".csv"
        file = filename + file_format
        print(file)
        params = urllib.parse.quote_plus(
            "DRIVER={SQL Server};SERVER=" + server + ";DATABASE=Database_name;UID=xxxxxxx;PWD=xxxxxxx")
        engine = db.create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)
        sql_DF = pd.read_sql("SELECT * FROM table_name",
                             con=engine, chunksize=50000)
        sql_DF.to_csv()
    except Exception as e:
        print(e)

def read_server_names():
    print("Reading Server Names")
    f = open("servers_data.txt", "r")
    contents = f.readlines()
    for line in contents:
        list.append(line.split(','))

def main():
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for line in zip(list, executor.map(write_to_database, list)):
            print()

if __name__ == '__main__':
    list = []
    read_server_names()
    main()
The problem with this code is that the process takes a lot of system memory. Can I get some guidance on a better way to do this task using either multi-threading or multi-processing, one that gives good performance while using fewer CPU resources?
I'd suggest using multiprocessing. I've also slightly refactored your reading code to avoid using a global variable.
The write function now prints three status messages: one when it begins reading from a given server, another when it finishes reading (into memory!), and another when it has finished writing to a file.
Concurrency is limited to 10 tasks, and each worker process is recycled after 100. You may want to change those parameters.
imap_unordered is used for slightly faster performance, since the order of tasks doesn't matter here.
If this is still too resource-intensive, you will need to do something other than naively using Pandas; instead, maybe just use SQLAlchemy to run the same query and write to the CSV file one row at a time.
import multiprocessing
import sqlalchemy as db
import urllib
import pandas as pd

file_format = ".csv"

def write_to_database(line):
    try:
        server, filename = line
        file = filename + file_format
        params = urllib.parse.quote_plus(
            "DRIVER={SQL Server};"
            "SERVER=" + server + ";"
            "DATABASE=Database_name;"
            "UID=xxxxxxx;"
            "PWD=xxxxxxx"
        )
        print(server, "start")
        engine = db.create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)
        # Read the whole table into memory; passing chunksize would return an
        # iterator instead of a DataFrame, which supports neither len() nor to_csv().
        sql_DF = pd.read_sql("SELECT * FROM table_name", con=engine)
        print(server, "read", len(sql_DF))
        sql_DF.to_csv(file)
        print(server, "write", file)
    except Exception as e:
        print(e)

def read_server_names():
    with open("servers_data.txt", "r") as f:
        for line in f:
            # Will break (on purpose) if there are more than 2 fields
            server, filename = line.strip().split(",")
            yield (server, filename)

def main():
    server_names = list(read_server_names())
    # 10 requests (subprocesses) at a time, recycle every 100 servers
    with multiprocessing.Pool(processes=10, maxtasksperchild=100) as p:
        for i, result in enumerate(p.imap_unordered(write_to_database, server_names), 1):
            print("Progress:", i, "/", len(server_names))
            # do nothing else with the result; the function deals with it

if __name__ == "__main__":
    main()
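Following up on the last point, here is a minimal sketch of the row-at-a-time alternative using SQLAlchemy and the csv module, so no DataFrame is ever built in memory. The connection string and table name are the same placeholders as above:

import csv
import urllib

import sqlalchemy as db

def stream_to_csv(server, filename):
    params = urllib.parse.quote_plus(
        "DRIVER={SQL Server};SERVER=" + server +
        ";DATABASE=Database_name;UID=xxxxxxx;PWD=xxxxxxx"
    )
    engine = db.create_engine("mssql+pyodbc:///?odbc_connect=%s" % params)
    with engine.connect() as conn, open(filename + ".csv", "w", newline="") as out:
        result = conn.execution_options(stream_results=True).execute(
            db.text("SELECT * FROM table_name")
        )
        writer = csv.writer(out)
        writer.writerow(result.keys())   # header row
        for row in result:               # one row at a time, never a full DataFrame
            writer.writerow(row)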

Python TypeError: stat: path should be string, bytes, os.PathLike or integer, not list

I am new to Python. I wrote a simple script to send all the images in a folder, sorted by time, to an API. This code works on just one file (jpg) and can't send the rest of the images in the folder. What I want is for the script to wait until an image is added to the folder; once images are in the folder, they should be sent to the API in order, oldest first. I am very confused, any help will be appreciated! Thanks.
import glob
import argparse
import requests
import json
import time
import os

def main():
    result = []
    file = glob.glob("/path/to/dir/*.jpg")
    regions = ['id']
    time_to_wait = 10000
    time_counter = 0
    while not os.path.exists(file):
        time.sleep(1)
        time_counter += 1
        if time_counter > time_to_wait:
            break
        print("waiting for file...")
    if os.path.isfile(file):
        with open(file, 'rb') as fp:
            response = requests.post(
                'https://GET_API/',
                data=dict(regions=regions),
                files=dict(upload=fp),
                headers={'Authorization': 'Token ' + 'XXX'})
        result.append(response.json())
        resp_dict = json.loads(json.dumps(result, indent=2))
        if resp_dict[0]['results']:
            num = resp_dict[0]['results'][0]['plate']
            print(f"DETECTED NUMBER: {num}")
        os.remove(file)
    else:
        print("file doesn't exists!")

if __name__ == '__main__':
    main()
You don't update file on each iteration, which may be why no new files are detected. Also, glob.glob() returns a list, so file needs to be treated as a list and iterated over. Your while loop should look something like this:
while True:
    files = glob.glob(os.path.join('path', 'to', 'dir', '*.jpg'))
    for file in files:
        if os.path.isfile(file):
            with open(file, 'rb') as fp:
                ...  # upload here, then delete the file after the with block
    time.sleep(1)  # wait before scanning the folder again
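Putting that together, here is a minimal sketch of a watcher loop that sends the oldest image first and then deletes it. The endpoint, token, and directory are the placeholders from the question:

import glob
import os
import time

import requests

API_URL = 'https://GET_API/'   # placeholder endpoint from the question
TOKEN = 'XXX'                  # placeholder token
regions = ['id']

while True:
    files = glob.glob("/path/to/dir/*.jpg")
    # Oldest files first, so images are sent in the order they appeared
    files.sort(key=os.path.getmtime)
    if not files:
        print("waiting for file...")
        time.sleep(1)
        continue
    for file in files:
        with open(file, 'rb') as fp:
            response = requests.post(
                API_URL,
                data=dict(regions=regions),
                files=dict(upload=fp),
                headers={'Authorization': 'Token ' + TOKEN})
        result = response.json()
        if result.get('results'):
            print("DETECTED NUMBER:", result['results'][0]['plate'])
        os.remove(file)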

Python Script Only Reads the First 100 Files

I have a folder with 616 files, but my script only reads the first 100. What settings do I need to change to get it to read them all? It's probably relevant that I'm using Anaconda Navigator's Jupyter Notebook.
Here's my code:
import re
import string
from collections import Counter
import os
import glob

def word_count(file_tokens):
    for word in file_tokens:
        count = Counter(file_tokens)
    return count

files_list = glob.glob("german/test/*/negative/*")
print(files_list)

for path in files_list:
    corpus, tache, classe, file_name = path.split("\\")
    file = open(path, mode="r", encoding="utf-8")
    read_file = file.read()
    ##lowercase
    file_clean = read_file.lower()
    ##tokenize
    file_tokens = file_clean.split()
    ##word count and sort
    print(word_count(file_tokens))
You are probably hitting the maximum open files limit on your system. You can either close every file at the end of the loop, or use a context manager in the loop:
with open(path, mode="r", encoding="utf-8") as file:
    ...
Have you tried printing the length of the files_list variable to check whether it is 616 or 100?
print(len(files_list))
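For illustration, here is the loop from the question rewritten with the context manager, so each file is closed as soon as it has been read:

for path in files_list:
    corpus, tache, classe, file_name = path.split("\\")
    with open(path, mode="r", encoding="utf-8") as file:
        read_file = file.read()
    # file is closed here, so the open-files limit is never reached
    file_tokens = read_file.lower().split()
    print(word_count(file_tokens))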

using supported site for video processing

I am trying to change my code to support video processing from multiple sites (YouTube, Vimeo, etc.) using the youtube-dl extractors. I don't want to import youtube-dl (unless necessary); I would prefer to call a function. My understanding is that this: youtube-dl http://vimeo.com/channels/YOUR-CHANNEL is a command-line tool. Please help!
import pymongo
import get_media
import configparser as ConfigParser

# shorten list to first 10 items
def shorten_list(mylist):
    return mylist[:10]

def main():
    config = ConfigParser.ConfigParser()
    config.read('settings.cfg')
    youtubedl_filename = config.get('media', 'youtubedl_input')
    print('creating file: %s - to be used as input for youtubedl' % youtubedl_filename)
    db = get_media.connect_to_media_db()
    items = db.raw
    url_list = []
    cursor = items.find()
    records = dict((record['_id'], record) for record in cursor)
    # iterate through records in media items collection
    # if 'Url' field exists and starts with youtube, add url to list
    for item in records:
        item_dict = records[item]
        #print(item_dict)
        if 'Url' in item_dict['Data']:
            url = item_dict['Data']['Url']
            if url.startswith('https://www.youtube.com/'):
                url_list.append(url)
    # for testing purposes
    # shorten list to only download a few files at a time
    url_list = shorten_list(url_list)
    # save list of youtube media file urls
    with open(youtubedl_filename, 'w') as f:
        for url in url_list:
            f.write(url + '\n')

if __name__ == "__main__":
    main()
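For illustration, youtube-dl can be driven from Python as well as from the command line. Here is a minimal sketch, assuming the youtube_dl package is installed and url_list is built as in the script above:

import subprocess

import youtube_dl  # pip install youtube-dl

def download_with_library(url_list):
    # youtube-dl picks the right extractor (YouTube, Vimeo, ...) for each URL
    ydl_opts = {'outtmpl': '%(title)s-%(id)s.%(ext)s'}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(url_list)

def download_with_cli(url_list):
    # Same effect without importing youtube_dl: call the command-line tool
    for url in url_list:
        subprocess.run(['youtube-dl', url], check=True)

The library route lets youtube-dl choose the extractor per site; the subprocess route avoids importing youtube_dl at the cost of spawning one process per URL.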

How to implement the 'tempfile' module to this code

This is a small widget I am designing that is meant to 'browse' while circumventing proxy settings. I have been told on Code Review that tempfile would be beneficial here, but I am struggling to fit it into my program's current logic. Here is the code:
import urllib.request
import webbrowser
import os
import tempfile

location = os.path.dirname(os.path.abspath(__file__))
proxy_handler = urllib.request.ProxyHandler(proxies=None)
opener = urllib.request.build_opener(proxy_handler)

def navigate(query):
    response = opener.open(query)
    html = response.read()
    return html

def parse(data):
    start = str(data)[2:-1]
    lines = start.split('\\n')
    return lines

while True:
    url = input("Path: ")
    raw_data = navigate(url)
    content = parse(raw_data)
    with open('cache.html', 'w') as f:
        f.writelines(content)
    webbrowser.open_new_tab(os.path.join(location, 'cache.html'))
Hopefully someone who has worked with these modules before can help me. The reason I want to use tempfile is that my program gets raw HTML, parses it, and stores it in a file. This file is overwritten every time a new input comes in, and would ideally be deleted when the program stops running. Also, the file doesn't need to exist when the program starts, so tempfile seems logical from that point of view as well.
Since you are passing the name of the file to webbrowser.open_new_tab(), you should use a NamedTemporaryFile
cache = tempfile.NamedTemporaryFile()
...
cache.seek(0)
cache.writelines(bytes(line, 'UTF-8') for line in content)
cache.seek(0)
webbrowser.open_new_tab('file://' + cache.name)
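Here is a minimal sketch of how that could slot into the original loop, reusing navigate() and parse() from the question; the suffix='.html' argument is an assumption added so the browser treats the file as HTML:

import tempfile
import webbrowser

# One named temporary file for the whole session; it is removed
# automatically when it is closed or the program exits.
cache = tempfile.NamedTemporaryFile(suffix='.html')

while True:
    url = input("Path: ")
    content = parse(navigate(url))
    cache.seek(0)
    cache.truncate()  # drop the previous page before writing the new one
    cache.writelines(bytes(line, 'UTF-8') for line in content)
    cache.flush()
    webbrowser.open_new_tab('file://' + cache.name)

Note that on Windows the browser may not be able to open the file while the script still holds it open; passing delete=False and removing the file manually at exit is one workaround.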
