How to change smiles to images(png) using RDKit in python script - python

How to save image files after generate image file in python script?
def mols_to_pngs(mols, basename = "test"):
filenames = []
for i, mol in enumerate(mols):
filename = "%s%d.png" % (basename, i)
Draw.MolToFile(mol,filename)
filenames.append(filename)
return filenames
and I want to this process automatically using csv file and python script

To automatically save image files generated from a python script using a csv file, you need to read the data from the csv file and process it in your script. Here's a basic example:
import csv
import sys
def mols_to_pngs(mols, basename = "test"):
filenames = []
for i, mol in enumerate(mols):
filename = "%s%d.png" % (basename, i)
Draw.MolToFile(mol,filename)
filenames.append(filename)
return filenames
# Read the csv file
try:
with open('data.csv', 'r') as file:
reader = csv.reader(file)
# Skip the header row
next(reader)
mols = []
for row in reader:
# Process the data from the csv file
mol = process_data(row)
mols.append(mol)
except IOError:
print("Could not read file:", file)
sys.exit()
# Save the image files
filenames = mols_to_pngs(mols)
print("Saved the following files:", filenames)
Note that you need to replace the process_data function with your own implementation to process the data from the csv file and create the mol objects. Additionally, you may need to modify the code to match your specific use case.

Related

Converting a list of txt files into a list of csv files with python

I have the rolling code to convert a single .txt file into a a single .csv file, but I need the code to iterate over a directory of .txt files and gives out a directory of the same .txt files but in .csv format.
import csv
textfile = 'X:/general/DavidOrgEcon/GSTT/text to csv/Group.txt'
outfile = 'X:/general/DavidOrgEcon/GSTT/text to csv/Group.csv'
with open(textfile, 'r') as csvfile:
In_text = csv.reader(csvfile, delimiter=':')
all_rows = []
row_dict = {}
count_row = 1
for row in In_text:
if len(row) > 0:
row_dict[row[0].strip()] = row[1].strip()
if count_row % 4 == 0:
all_rows.append(row_dict)
row_dict = {}
count_row += 1
print(all_rows)
keys = all_rows[0].keys()
print(keys)
with open(outfile, 'w', newline='') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(all_rows)
So assuming you have your existing function
def text_to_csv(infilepath, outfilepath):
...
which can read a text file from infilepath and output the csv to outfilepath, then you can make a new function that takes two directories and calls it on every text file in the first:
import os
def convert_directory(in_dir, out_dir):
# Loop through every file in the directory
for filename in os.listdir(in_dir):
# Split the file name into a base portion and an extension
# e.g. "file.txt" -> ("file", ".txt")
base_name, extension = os.path.splitext(filename)
# If it is a text file, do the transformation
if extension == ".txt":
# Construct the name of the csv file to create
csv_filename = f"{base_name}.csv"
# Call your function with the full filepaths
text_to_csv(
os.path.join(in_dir, filename),
os.path.join(out_dir, csv_filename)
)
convert_directory(
"X:/general/DavidOrgEcon/GSTT/text to csv/input_dir",
"X:/general/DavidOrgEcon/GSTT/text to csv/output_dir",
)

Python: Convert all files in directory into one .TXT?

I have been trying to convert a number of DOCX files into TXT.
It works for a single file using the code below:
import docx
def getText(filename):
doc = docx.Document(filename)
fullText = []
for para in doc.paragraphs:
fullText.append(para.text)
return '\n'.join(fullText)
if __name__ == '__main__':
filename='/content/drive/My Drive/path/file.DOCX'; #file name
fullText=getText(filename)
print (fullText)
file = open("copy.txt", "w")
file.write(fullText)
file.close()
I tried different options (i.e. glob) but did not manage get it to do the above operation on all files in a folder.
Ideally the output should be 1 large text file and not separate ones.
I will need to do some formatting and assigning of IDs in that file in a next step.
Thank you for your help!
corp-alt
With file = open("copy.txt", "w") you open the file and replace its content with write().
With file = open("copy.txt", "a") you append to the existing file with write(). Or maybe even better:
With file = open("copy.txt", "a+") you append to an existing file with write(), or create a new file if it doesn't exist yet.
To go through all files in a folder you can loop over them:
import os
import docx
def getText(filename):
doc = docx.Document(filename)
fullText = []
for para in doc.paragraphs:
fullText.append(para.text)
return '\n'.join(fullText)
if __name__ == '__main__':
foldername='/content/drive/My Drive/path/'; #folder name
all_files = os.listdir(foldername) #get all filenames
docx_files = [ filename for filename in all_files if filename.endswith('.docx') ] #get .docx filenames
file = open("copy.txt", "a+")
for docx_file in docx_files: #loop over .docx files
fullText=getText(filename)
file.write(fullText)
file.close()

Is there a way to load data from all files in a directory using Python?

My question: Is there a way to load data from all files in a directory using Python
Input: Get all files in a given directory of mine (wow.txt, testting.txt,etc.)
Process: I want to run all the files through a def function
Output: I want the output to be all the files names and their respective content below it.For example:
/home/file/wow.txt
"all of its content"
/home/file/www.txt
"all of its content"
Here is my code:
# Import Functions
import os
import sys
# Define the file path
path="/home/my_files"
file_name="wow.txt"
#Load Data Function
def load_data(path,file_name):
"""
Input : path and file_name
Purpose: loading text file
Output : list of paragraphs/documents and
title(initial 100 words considered as title of document)
"""
documents_list = []
titles=[]
with open( os.path.join(path, file_name) ,"rt", encoding='latin-1') as fin:
for line in fin.readlines():
text = line.strip()
documents_list.append(text)
print("Total Number of Documents:",len(documents_list))
titles.append( text[0:min(len(text),100)] )
return documents_list,titles
#Output
load_data(path,file_name)
Here is my output:
My Problem is that my output only takes one file and shows its content. Obviously, i defined the path and file name in my code to one file but I am confused as to how to write the path in a way to load all the files and output each of its contents separately. Any suggestions?
Using glob:
import glob
files = glob.glob("*.txt") # get all the .txt files
for file in files: # iterate over the list of files
with open(file, "r") as fin: # open the file
# rest of the code
Using os.listdir():
import os
arr = os.listdir()
files = [x for x in arr if x.endswith('.txt')]
for file in files: # iterate over the list of files
with open(file, "r") as fin: # open the file
# rest of the code
Try this:
import glob
for file in glob.glob("test/*.xyz"):
print(file)
if my directory name was "test" and I had lots of xyz files in them...
You can use glob and pandas
import pandas as pd
import glob
path = r'some_directory' # use your path
all_files = glob.glob(path + "/*.txt")
li = []
for filename in all_files:
#read file here
# if you decide to use pandas you might need to use the 'sep' paramaeter as well
df = pd.read_csv(filename, index_col=None, header=0)
li.append(df)
# get it all together
frame = pd.concat(li, axis=0, ignore_index=True)
I will take advantage of the function you have already written, so use the following:
data = []
path="/home/my_files"
dirs = os.listdir( path )
for file in dirs:
data.append(load_data(path, file))
In this case you will have all data in the list data.
Hi you can use a for loop on a listdir:
os.listdir(<path of your directory>)
this gives you the list of files in your directory, but this gives you also the name of folders in that directory
Try generating a file list first, then passing that to a modified version of your function.
def dir_recursive(dirName):
import os
import re
fileList = list()
for (dir, _, files) in os.walk(dirName):
for f in files:
path = os.path.join(dir, f)
if os.path.exists(path):
fileList.append(path)
fList = list()
prog = re.compile('.txt$')
for k in range(len(fileList)):
binMatch = prog.search(fileList[k])
if binMatch:
fList.append(binMatch.string)
return fList
def load_data2(file_list):
documents_list = []
titles=[]
for file_path in file_list:
with open( file_path ,"rt", encoding='latin-1') as fin:
for line in fin.readlines():
text = line.strip()
documents_list.append(text)
print("Total Number of Documents:",len(documents_list))
titles.append( text[0:min(len(text),100)] )
return documents_list,titles
# Generate a file list & load the data from it
file_list = dir_recursive(path)
documents_list, titles = load_data2(file_list)

How to Read CSV, Create a QR Code, and Write its Filename to New Column?

I'm writing a Python script to generate a QR code from the first column in a csv (concatenated with a local name), and that part works well. The csv just has three columns and looks like this:
ID First Last
144 Jerry Seinfeld
491 George Costanza
104 Elaine Benes
99 Cosmo Kramer
And I use my Python script to take that file, append a prefix to the IDs (in this case, 'NBC') and then create QR codes for each record in a new folder. It's a little long but all of this seems to work fine also:
import csv
import qrcode
import os
import shutil
import time
import inquirer
#Identify Timestamp
timestr = time.strftime("%Y%m%d-%H%M%S")
local = 'NBC'
#Load csv
filename = "stackoverflowtest.csv"
#Path to new local folder
localfolder = local
localimagefolder = localfolder+'/image'
localfilefolder = localfolder+'/files'
#Check/create folders based on local
if not os.path.exists(localfolder):
os.makedirs(localfolder)
if not os.path.exists(localimagefolder):
os.makedirs(localimagefolder)
if not os.path.exists(localfilefolder):
os.makedirs(localfilefolder)
#Copy uploaded file to their local's file folder
shutil.copy2(filename, localfilefolder+'/'+local+'-'+timestr+'.csv') # complete target filename given
#Read csv and generate QR code for local+first column of csv
with open(filename, 'rU') as csvfile:
next(csvfile, None) #skip header row
reader = csv.reader(csvfile, delimiter=',', dialect=csv.excel_tab)
for i, row in enumerate(reader):
labeldata = row[0] #Choose first column of data to create QR codes
print labeldata
qr = qrcode.QRCode(
version=1,
error_correction=qrcode.constants.ERROR_CORRECT_L,
box_size=10,
border=4,
)
qr.add_data(local+"-"+labeldata)
qr.make()
img = qr.make_image()
img.save(localimagefolder+"/"+local+"-"+labeldata+".png".format(i)) #Save image
It creates the NBC folder, copies each csv file in one subfolder, and creates the QR codes for each ID (NBC-144,NBC-491,NBC-104,NBC-99) in another.
The part where I'm running into a problem is opening the csv and writing the filepath/filename back to the csv (or a copy of the csv since from what I've read, I likely can't do it to the same one). Is that possible?
The closest I've come with a script that works is appending the local name with the ID and writing that back to a column but I can't seem to figure out how to do the same with a variable, let alone a filepath/filename:
import csv
import os
import sys
filename = 'stackoverflowtest.csv'
newfilename = 'stackoverflowtest2.csv'
local = 'NBC'
with open(filename, 'rU') as f:
reader = csv.reader(f)
with open(newfilename, 'w') as g:
writer = csv.writer(g)
for row in reader:
new_row = row[0:] + ['-'.join([local, row[0]])]
writer.writerow(new_row)
Is it possible to write something like that within my existing script to add a column for the filepath and filename? Everything I try breaks -- especially if I attempt to do it in the same script.
EDIT:
This is my closest attempt that overwrote the existing file
f=open(newfilename,'r+')
w=csv.writer(f)
for path, dirs, files in os.walk(path):
for filename in files:
w.writerow([newfilename])
Also it's still in a separate script.
Since I can't run the code in your question directly, I had to commented-out portions of it in what's below for testing, but think it does everything you wanted in one loop in one script.
import csv
#import qrcode
import os
import shutil
import time
#import inquirer
# Identify Timestamp
timestr = time.strftime("%Y%m%d-%H%M%S")
local = 'NBC'
# Load csv
filename = "stackoverflowtest.csv"
# Path to new local folder
localfolder = local
localimagefolder = os.path.join(localfolder, 'image')
localfilefolder = os.path.join(localfolder, 'files')
# Check/create folders based on local
if not os.path.exists(localfolder):
os.makedirs(localfolder)
if not os.path.exists(localimagefolder):
os.makedirs(localimagefolder)
if not os.path.exists(localfilefolder):
os.makedirs(localfilefolder)
# Copy uploaded file to their local's file folder
target = os.path.join(localfilefolder, local+'-'+timestr+'.csv') # Target filename
#shutil.copy2(filename, target) # Don't need to do this.
# Read csv and generate QR code for local+first column of csv
with open(filename, 'rb') as csvfile, open(target, 'wb') as outfile:
reader = csv.reader(csvfile, delimiter=',', dialect=csv.excel_tab)
writer = csv.writer(outfile, delimiter=',', dialect=csv.excel_tab)
next(reader) # Skip header row.
for row in reader:
id, first, last = row
# qr = qrcode.QRCode(
# version=1,
# error_correction=qrcode.constants.ERROR_CORRECT_L,
# box_size=10,
# border=4,
# )
#
# qr.add_data(local+"-"+id)
# qr.make()
#
# img = qr.make_image()
imagepath = os.path.join(localimagefolder, local+"-"+id+".png")
# img.save(imagepath) # Save image.
print "saving img:", imagepath
writer.writerow(row + [local+'-'+id, imagepath])
Output from sample input data:
144,Jerry,Seinfeld,NBC-144,NBC/image/NBC-144.png
491,George,Costanza,NBC-491,NBC/image/NBC-491.png
104,Elaine,Benes,NBC-104,NBC/image/NBC-104.png
99,Cosmo,Kramer,NBC-99,NBC/image/NBC-99.png

How do I apply my python code to all of the files in a folder at once, and how do I create a new name for each subsequent output file?

The code I am working with takes in a .pdf file, and outputs a .txt file. My question is, how do I create a loop (probably a for loop) which runs the code over and over again on all files in a folder which end in ".pdf"? Furthermore, how do I change the output each time the loop runs so that I can write a new file each time, that has the same name as the input file (ie. 1_pet.pdf > 1_pet.txt, 2_pet.pdf > 2_pet.txt, etc.)
Here is the code so far:
path="2_pet.pdf"
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
The following script solve your problem:
import os
sourcedir = 'pdfdir'
dl = os.listdir('pdfdir')
for f in dl:
fs = f.split(".")
if fs[1] == "pdf":
path_in = os.path.join(dl,f)
content = getPDFContent(path_in)
encoded = content.encode("utf-8")
path_out = os.path.join(dl,fs[0] + ".txt")
text_file = open(path_out, 'w')
text_file.write(encoded)
text_file.close()
Create a function that encapsulates what you want to do to each file.
import os.path
def parse_pdf(filename):
"Parse a pdf into text"
content = getPDFContent(filename)
encoded = content.encode("utf-8")
## split of the pdf extension to add .txt instead.
(root, _) = os.path.splitext(filename)
text_file = open(root + ".txt", "w")
text_file.write(encoded)
text_file.close()
Then apply this function to a list of filenames, like so:
for f in files:
parse_pdf(f)
One way to operate on all PDF files in a directory is to invoke glob.glob() and iterate over the results:
import glob
for path in glob.glob('*.pdf')
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
Another way is to allow the user to specify the files:
import sys
for path in sys.argv[1:]:
...
Then the user runs your script like python foo.py *.pdf.
You could use a recursive function to search the folders and all subfolders for files that end with pdf. Than take those files and then create a text file for it.
It could be something like:
import os
def convert_PDF(path, func):
d = os.path.basename(path)
if os.path.isdir(path):
[convert_PDF(os.path.join(path,x), func) for x in os.listdir(path)]
elif d[-4:] == '.pdf':
funct(path)
# based entirely on your example code
def convert_to_txt(path):
content = getPDFContent(path)
encoded = content.encode("utf-8")
file_path = os.path.dirname(path)
# replace pdf with txt extension
file_name = os.path.basename(path)[:-4]+'.txt'
text_file = open(file_path +'/'+file_name, "w")
text_file.write(encoded)
text_file.close()
convert_PDF('path/to/files', convert_to_txt)
Because the actual operation is changeable, you can replace the function with whatever operation you need to perform (like using a different library, converting to a different type, etc.)

Categories

Resources