I have a folder (Molecules) with many sdf files (M00001.sdf, M00002.sdf and so on) representing different molecules. I also have a csv file where each row names a molecule (M00001, M00002 etc).
I'm writing code to pick out the files in the Molecules folder whose names appear as a row in the csv file.
First attempt
import os
path_to_files = '/path_to_folder/Molecules' # path to Molecules folder

# Read the molecule names once, up front. Lines read from a file keep their
# trailing newline, so every line is stripped here; comparing the raw line
# with a file's base name can never succeed.
with open('molecules.csv') as ligs: # Open the csv file of molecule names
    wanted = set(line.strip() for line in ligs if line.strip())

for files in os.listdir(path_to_files):
    names = os.path.splitext(files)[0] # get the basename (molecule name)
    # Report once per file instead of once per (file, csv row) pair.
    if names in wanted:
        print(names)
    else:
        print('File is not here')
However this returns nothing on the command line (literally nothing). What is wrong with this code?
I am not sure that this is the best way (I only know that the following code works for my data) but if your molecule.csv has the standard csv format, i.e. "molecule1,molecule2,molecule3 ...", you can try to rearrange your code in this way:
import os
import csv
path_to_files = '/path_to_folder/Molecules' # path to Molecules folder

# Parse the csv a single time and collect every cell of every row, so the
# file is not reopened and re-parsed for each entry of the folder.
with open('molecules.csv','r') as ligs:
    wanted = set(hit.strip() for elem in csv.reader(ligs) for hit in elem)

for files in os.listdir(path_to_files):
    # splitext drops only the final extension; replace(".sdf", "") would also
    # remove ".sdf" if it appeared in the middle of a name.
    names = os.path.splitext(files)[0]
    # One message per file, rather than one per non-matching csv cell.
    if names in wanted:
        print(names)
    else:
        print('File is not here')
See csv File Reading and Writing for csv module
I solved the problem with a rather brute approach
import os
import csv
import shutil
path_to_files = None # path to Molecules folder (fill in before running)
new_path = None # new folder to save files (fill in before running)

# create the folder to store the molecules, unless it already exists
if not os.path.isdir(new_path):
    os.makedirs(new_path)

# Collect the wanted molecule names in a set: strip() removes '\r\n' as well
# as '\n', blank rows are skipped, and the file is closed when the block ends.
with open('molecules.csv', 'r') as hits:
    ligands = set(line.strip() for line in hits if line.strip())

for files in os.listdir(path_to_files):
    molecule_name = os.path.splitext(files)[0]
    if molecule_name in ligands:
        # Copy the entry that was actually listed instead of rebuilding its
        # name with a hard-coded '.sdf' suffix.
        old_file = os.path.join(path_to_files, files)
        new_file = os.path.join(new_path, files)
        shutil.copy(old_file, new_file)
Related
I have a few data files in a directory, and I want to move them to the subdirectories based on their filenames. Let's say we created the first directory named "20220322_170444," and it should contain the first four files only because in the next file the "el" is less than the previous one, so the second folder, let's say is "20220322_170533", then it should contain next eight files until the el becomes less again than the previous name.
example data
files =[
'cfrad.20220322_170444.122_COW1_v2_s02_el3.40_SUR.nc',
'cfrad.20220322_170456.550_COW1_v2_s03_el4.22_SUR.nc',
'cfrad.20220322_170508.975_COW1_v2_s04_el5.09_SUR.nc',
'cfrad.20220322_170521.397_COW1_v2_s05_el5.99_SUR.nc',
'cfrad.20220322_170533.811_COW1_v2_s06_el0.45_SUR.nc',
'cfrad.20220322_170546.228_COW1_v2_s07_el1.20_SUR.nc',
'cfrad.20220322_170558.648_COW1_v2_s08_el1.90_SUR.nc',
'cfrad.20220322_170611.072_COW1_v2_s09_el2.61_SUR.nc',
'cfrad.20220322_170623.503_COW1_v2_s10_el3.40_SUR.nc',
'cfrad.20220322_170635.923_COW1_v2_s11_el4.21_SUR.nc',
'cfrad.20220322_170648.341_COW1_v2_s12_el5.09_SUR.nc',
'cfrad.20220322_170700.765_COW1_v2_s13_el5.99_SUR.nc',
'cfrad.20220322_170713.179_COW1_v2_s14_el0.45_SUR.nc',
'cfrad.20220322_170725.604_COW1_v2_s15_el1.20_SUR.nc',
'cfrad.20220322_170738.030_COW1_v2_s16_el1.90_SUR.nc',
'cfrad.20220322_170750.461_COW1_v2_s17_el2.61_SUR.nc',
'cfrad.20220322_170802.877_COW1_v2_s18_el3.40_SUR.nc',
'cfrad.20220322_170815.301_COW1_v2_s19_el4.22_SUR.nc',
'cfrad.20220322_170827.715_COW1_v2_s20_el8.01_SUR.nc',
'cfrad.20220322_170840.144_COW1_v2_s21_el11.02_SUR.nc']
# Write the same two-element array to every listed name so that the example
# files exist on disk.
placeholder = np.array([1,1])
for name in files:
    np.savetxt(name, placeholder)
What I tried is
import numpy as np
from datetime import datetime
import glob, os, re
import shutil
# Timestamp embedded in every file name, e.g. "20220322_170444".
stamp_pattern = re.compile(r'\d{4}\d{2}\d{2}_\d{2}\d{2}\d{2}')
# Elevation angle embedded in every file name, e.g. "3.40" in "_el3.40_".
el_pattern = re.compile(r'_el(\d+(?:\.\d+)?)_')

sweeps = []    # finished groups, one list of file names per sweep
temp = []      # files of the sweep currently being collected
prev_el = None # elevation of the previous file, None before the first one
for file in files:
    match_str = stamp_pattern.search(file)
    res = datetime.strptime(match_str.group(), '%Y%m%d_%H%M%S')
    print(res.strftime("%Y%m%d_%H%M%S"))
    # Compare elevations as numbers; comparing short string slices only looks
    # at the leading characters and mis-orders values such as 11.02 and 4.22.
    el = float(el_pattern.search(file).group(1))
    # A drop in elevation marks the first file of the next sweep.
    if prev_el is not None and el < prev_el:
        print("temp len: ", len(temp))
        sweeps.append(temp)
        temp = []
    temp.append(file)
    prev_el = el
# Flush the sweep that was still open so the last file is not lost.
if temp:
    sweeps.append(temp)
I now have a list named sweeps, and it contains the desired files; how can I now move these files into the directories? The directories should be named as I stated above, based on the date. I also have the date string available: res.strftime("%Y%m%d_%H%M%S") can be used to create the directory names.
Some string splitting can do this for you.
import shutil
import os
files = [
"cfrad.20220322_170444.122_COW1_v2_s02_el3.40_SUR.nc",
"cfrad.20220322_170456.550_COW1_v2_s03_el4.22_SUR.nc",
"cfrad.20220322_170508.975_COW1_v2_s04_el5.09_SUR.nc",
"cfrad.20220322_170521.397_COW1_v2_s05_el5.99_SUR.nc",
"cfrad.20220322_170533.811_COW1_v2_s06_el0.45_SUR.nc",
"cfrad.20220322_170546.228_COW1_v2_s07_el1.20_SUR.nc",
"cfrad.20220322_170558.648_COW1_v2_s08_el1.90_SUR.nc",
"cfrad.20220322_170611.072_COW1_v2_s09_el2.61_SUR.nc",
"cfrad.20220322_170623.503_COW1_v2_s10_el3.40_SUR.nc",
"cfrad.20220322_170635.923_COW1_v2_s11_el4.21_SUR.nc",
"cfrad.20220322_170648.341_COW1_v2_s12_el5.09_SUR.nc",
"cfrad.20220322_170700.765_COW1_v2_s13_el5.99_SUR.nc",
"cfrad.20220322_170713.179_COW1_v2_s14_el0.45_SUR.nc",
"cfrad.20220322_170725.604_COW1_v2_s15_el1.20_SUR.nc",
"cfrad.20220322_170738.030_COW1_v2_s16_el1.90_SUR.nc",
"cfrad.20220322_170750.461_COW1_v2_s17_el2.61_SUR.nc",
"cfrad.20220322_170802.877_COW1_v2_s18_el3.40_SUR.nc",
"cfrad.20220322_170815.301_COW1_v2_s19_el4.22_SUR.nc",
"cfrad.20220322_170827.715_COW1_v2_s20_el8.01_SUR.nc",
"cfrad.20220322_170840.144_COW1_v2_s21_el11.02_SUR.nc",
]
import re  # used only for parsing the elevation angle below

basepath = "."

# create small placeholder files so the example can be run as-is
for f in files:
    with open(os.path.join(basepath, f), "w") as of:
        of.write("\n")

# Captures the full elevation angle, e.g. "3.40" in "..._s02_el3.40_SUR.nc".
# Splitting the name on "." and calling int() keeps only the integer part, so
# a drop such as 5.99 -> 5.09 would go unnoticed.
el_pattern = re.compile(r"_el(\d+(?:\.\d+)?)_")

# infinity forces the if statement below to be True on the first file
el = float("inf")
for f in files:
    match = el_pattern.search(f)
    if match is None:
        raise ValueError("no elevation angle found in file name: " + f)
    new_el = float(match.group(1))
    if new_el < el:
        # elevation dropped: this file starts a new sweep, named after its
        # timestamp (second dot-separated field of the file name)
        curr_dir = f.split(".")[1]
        print(curr_dir)
        # create the directory under basepath, where the files are moved to
        os.makedirs(os.path.join(basepath, curr_dir), exist_ok=True)
    # remember this elevation for the comparison with the next file
    el = new_el
    # move file
    shutil.move(os.path.join(basepath, f), os.path.join(basepath, curr_dir, f))
I'm new to Python and I've been trying to simplify a manual task that I do on a daily basis. I have a text file with a list of file names, separated into groups by a blank line, like this:
fileName1
fileName2
fileName3
fileName4
fileName5
fileName6
fileName7
fileName8
fileName9
fileName10
fileName11
fileName12
All of these files are in one folder, and I want to find each group of files and move it into a separate folder; the name of each new folder should be the name of the first file of its group.
I´m doing my research and I found how to do each step separately using os and shutil modules but I can´t find a way to join them together and make a beautiful script, any help that I can get from you guys will be awesome, thanks!!
Here's a little script that can do that.
I've made two assumptions:
The file with the list of files is stored in the same directory as source files
There is a blank line after the last file so the script can grab the last group
import os
from shutil import move
from itertools import groupby
#Where the files are originally stored
src_path = "C:\\temp\\src\\"
#Where the group folders will go
dest_path = "C:\\temp\\dest\\"
#Open up the file containing the list of files
with open(os.path.join(src_path, "list_of_files.txt")) as txt:
    #Strip each line so blank lines become "" and names lose their newline
    names = [line.strip() for line in txt]
#Runs of non-empty names are the groups; runs of blank lines only separate
#them. Grouping on truthiness keeps the last group even when the list does
#not end with a blank line, and tolerates several blank lines in a row.
list_of_groups = [list(g) for has_name, g in groupby(names, key=bool) if has_name]
#Iterate through each group
for group in list_of_groups:
    #The folder is named after the first file of the group
    folder_name = os.path.join(dest_path, group[0])
    if not os.path.isdir(folder_name):
        #Create a folder for each group if it doesn't already exist
        os.makedirs(folder_name)
    #Move over each file in the group
    for file in group:
        move(os.path.join(src_path, file), os.path.join(folder_name, file))
When reading a file line by line, you can inspect the characters of each line. A blank line consists of nothing but the newline character, written \n.
import os
filepath1 = os.getcwd() # current working directory
filepath2 = "path\\to\\where\\you\\want\\dir"
filename = os.path.join(filepath1,"yourfilename.txt")
dirName = []            # names of the group folders created so far
groupName = "filegroup"
idx = 0                 # number of the current group, 1-based once started
newPath = ""
new_group = True        # the next name that is read opens a new group folder
with open(filename, "r") as file:
    for line in file:
        entry = line.strip()
        if not entry: # blank line: the current group is finished
            new_group = True
            continue
        if new_group:
            idx += 1
            dirName.append(groupName + str(idx))
            # os.path.join supplies the separator that plain string
            # concatenation of filepath2 and the folder name leaves out
            newPath = os.path.join(filepath2, dirName[-1])
            os.mkdir(newPath)
            new_group = False
        # NOTE(review): this creates a directory named after the entry inside
        # the group folder; it does not move a file. Confirm that is intended.
        os.mkdir(os.path.join(newPath, entry))
My code currently unzips one zip folder and finds the file called file.txt and extracts it. Now I need to unzip multiple folders that have the extension .zip. I have tried to use code similar to what I need it to do but the problem is that now I have to find a file called file.txt in each of those .zip folders and extract that file only . Also to store file.txt into a separate folder that has the same name where it came from. Thank you in advance for your time.
import re
import os
from zipfile import ZipFile
def pain():
    """Unzip input_files.zip and tabulate every extracted ``file.txt``.

    The archive is extracted into the current directory, and every ``.txt``
    member is additionally extracted into ``storage``. A header row is printed
    and written to ``output2.txt``; then each entry of the input directory
    whose name contains ``file.txt`` is passed to ``output_contents``.
    """
    print("\t\t\tinput_files.zip has been unzipped")
    with ZipFile('input_files.zip', 'r') as zipObj:
        zipObj.extractall()
        for fileName in zipObj.namelist():
            if fileName.endswith('.txt'):
                zipObj.extract(fileName, 'storage')
    outfile = "output2.txt" #this will be the filename that the code will write to
    file_name1 = "file.txt"
    #master column row; this is the order in which the data is collected
    header = 'Filename\tLine\tnumber of numbers\tstring separated by a comma\twhite space found\ttab found\tcarriage return found\n'
    #Raw string: in a normal literal "\U" starts a unicode escape, which is a
    #SyntaxError on Python 3.
    input_dir = r'C:\Users\M29858\Desktop\TestPy\Version10\input_files'
    print(header) #shows the master column in the python shell
    #the with-block closes the output file even if reading an input fails
    with open(outfile, "wt") as baconFile:
        baconFile.write(header) #writes the master column to the output file
        for filename in os.listdir(input_dir):
            #only open entries that are going to be reported
            if file_name1 not in filename:
                continue
            #open from the directory that was listed, not a second relative path
            with open(os.path.join(input_dir, filename), 'r') as f:
                output_contents(filename, f, baconFile)
def output_contents(filename, f, baconFile):
    """Print and write one formatted report row per line of ``f``.

    Each row holds: ``filename``, the 1-based line number, the number of
    comma-separated pieces, the count of digits after the decimal point for
    every number found (comma-joined), whether the line contains a space,
    whether it contains a tab, and whether it contains a newline
    ("False (end of file)" when it does not). Every row is printed and also
    written to ``baconFile``, preceded there by a newline.
    """
    row_layout = "\t{0:<4}\t{1:<17}\t{2:<27}\t{3:17}\t{4:9}\t{5:21}"
    number_pattern = re.compile(r'\d+\.?(\d+)?')
    for index, line in enumerate(f, 1):
        #how many pieces the line splits into around commas
        piece_count = len(line.split(','))
        whitespace_found = " " in line
        tab_found = "\t" in line
        #a line without a newline can only be the last line of the file
        carriage_found = True if '\n' in line else "False (end of file)"
        #digits after the decimal point of each number (0 when there are none)
        sigfigs = ','.join(str(len(g)) for g in number_pattern.findall(line))
        row = filename + row_layout.format(
            index, piece_count, sigfigs,
            str(whitespace_found), str(tab_found), str(carriage_found))
        print(row)
        baconFile.write('\n')
        baconFile.write(row)


if __name__ == '__main__':
    pain()
#THIS WORKS
import glob
import os
from zipfile import ZipFile
def main():
    """Extract ``file.txt`` from every zip archive in the current directory.

    For each ``*.zip`` that contains a member named ``file.txt``, a directory
    named after the archive (without its extension) is created if needed and
    only that member is extracted into it. Archives without ``file.txt`` are
    skipped.
    """
    for fname in glob.glob("*.zip"): # get all the zip files
        with ZipFile(fname) as archive:
            # if there's no file.txt, ignore and go on to the next zip file
            if 'file.txt' not in archive.namelist():
                continue
            # make a new directory named after the zip file; an existing
            # directory (e.g. from a previous run) is reused
            dirname = os.path.splitext(fname)[0]
            if not os.path.isdir(dirname):
                os.mkdir(dirname)
            # extract file.txt into the directory you just created
            archive.extract('file.txt', path=dirname)
import os, unicodecsv as csv
# open and store the csv file
IDs = {}
with open('labels.csv','rb') as csvfile:
    timeReader = csv.reader(csvfile, delimiter = ',')
    # build dictionary with associated IDs
    for row in timeReader:
        # skip blank or incomplete rows instead of failing on row[1]
        if len(row) < 2:
            continue
        IDs[row[0]] = row[1]
# move files
path = 'train/'
tmpPath = 'train2/'
for oldname in os.listdir(path):
    # ignore files in path which aren't in the csv file
    if oldname in IDs:
        # NOTE(review): several ids can map to the same name, so every one of
        # them is renamed to the same target path.
        try:
            os.rename(os.path.join(path, oldname), os.path.join(tmpPath, IDs[oldname]))
        except OSError:
            # catch only filesystem errors; a bare except would also hide
            # KeyboardInterrupt and programming mistakes
            print('File ' + oldname + ' could not be renamed to ' + IDs[oldname] + '!')
I am trying to sort my files according to this csv file. But the file contains many ids with same name. Is there a way to move files with same name to 1 folder or adding a number in front of a file if the file with same name already exist in directory?
Example-
id name
001232131hja1.jpg golden_retreiver
0121221122ld.jpg black_hound
0232113222kl.jpg golden_retreiver
0213113jjdsh.jpg alsetian
05hkhdsk1233a.jpg black_hound
I actually want to move all the files having id corresponding to golden_retreiver to one folder and so on.
Based on what you describe, here is my approach:
import csv
import os
SOURCE_ROOT = 'train'
DEST_ROOT = 'train2'
with open('labels.csv') as infile:
    reader = csv.reader(infile)
    next(reader, None) # Skip the header row (no error on an empty file)
    seen = set() # breeds whose directory has already been taken care of
    for row in reader:
        # tolerate blank lines and rows with missing columns
        if len(row) < 2:
            continue
        dogid, breed = row[0], row[1]
        # Create a new directory if needed
        if breed not in seen:
            breed_dir = os.path.join(DEST_ROOT, breed)
            # makedirs also creates DEST_ROOT; the isdir test keeps a re-run
            # from failing on directories left by an earlier run
            if not os.path.isdir(breed_dir):
                os.makedirs(breed_dir)
            seen.add(breed)
        # ids may or may not already carry the extension; never double it
        filename = dogid if dogid.lower().endswith('.jpg') else dogid + '.jpg'
        src = os.path.join(SOURCE_ROOT, filename)
        dest = os.path.join(DEST_ROOT, breed, filename)
        try:
            os.rename(src, dest)
        except OSError as e:
            # OSError is defined on every platform (WindowsError is not);
            # a missing source file is reported and then ignored
            print(e)
Notes
For every line in the data file, I create the breed directory at the destination. I use the set seen to make sure that I only create each directory once.
After that, it is a trivial matter of moving the files into place.
One possible move error: the file does not exist in the source dir. In that case, the code just prints out the error and ignores it.
I need to add columns to a "matched" shape file based on a csv. I have one last step to complete which is to get the value to enter into the shp from the csv.
I get
readCSV[rowID] Traceback (most recent call last): File "", line 1, in TypeError: '_csv.reader'
object is not subscriptable
The stripped down CSV is
The files look like
The code matches OVL_CAT + OVL2_DESC to the File Name.
I then get the code to add a column called LGA_CODE and need to populate it with '583094' which is row 2, column 1...how do I get this when I can't call FileList2 to get row 2 from the csv (3 in the example below but 2 in python)?
import os, sys, datetime, csv, arcpy, string
from subprocess import Popen
from itertools import islice
# Script outline: build search strings from AOI.csv, walk SourceDIR for
# matching .shp files, then try to read a CSV cell for each matching file.
# NOTE(review): the indentation of this listing has been lost, so the exact
# nesting of the loops and conditionals below cannot be confirmed from here.
top = os.getcwd() # change to a specific path if required.
# This otherwise starts with the directory the script is in (preferred).
RootOutput = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch' # change if you want output somewhere else
# top is overwritten here, so the os.getcwd() value above is not used
top=RootOutput
SourceDIR = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch\OM_011' # source of your data (subdirectories searched as well
outDIR = top+"\\workingFiles" # directory where output is written to. Includes temp files
finalDIR = top+"\\final" # folder for final data only
AOI = 'AOI.csv' # name of the file containing the las file names in the second column
# zero-based column indexes of the two CSV fields joined into a search string
Compare_Column = 2
Compare_Column2 = 3
# END setting base paths
# NOTHING BELOW should need editing.
FileTypes=['shp']
SearchStrings=[]
filecount=0
List =[]
count=0
x=0
os.chdir(top)
#Generate list with unique file name codes from CSV
# the file object opened here is never explicitly closed
FileList = csv.reader(open(AOI))
SearchStrings=[]
rowID=0
for File in FileList:
#SearchStrings.append(File[0]+","+File[1])
# each entry has the form "<row index>_<first field>_<second field>"
SearchStrings.append(str(rowID)+'_'+File[Compare_Column]+'_'+File[Compare_Column2])
rowID=rowID+1
for root, dirs, files in os.walk(SourceDIR, topdown=False):
for fl in files:
currentFile=os.path.join(root, fl)
for FileType in FileTypes:
status= str.endswith(currentFile,FileType)
# status is a bool; it is converted to text and compared with 'True'
if str(status) == 'True':
List.append(currentFile)
for SearchString in SearchStrings:
#print currentFile
#print SearchString
# SearchString still starts with its "<row index>_" prefix in this test
if str(SearchString in currentFile) == 'True':
#print str(currentFile)+str(status)
List.append(currentFile)
filecount=filecount+1
#del fl
# Get list of Column Names
headers_count = 1
with open(AOI) as fin:
headers = list(islice(fin, headers_count))
delimiter=','
# str() of a list is its repr, so the pieces produced by split() still carry
# the brackets, quotes and escaped newline of that repr
header=str(headers)
header_list=header.split(delimiter)
# Process matching files
for fl in List:
header_count=0
for header in header_list:
dfStore=fl
#arcpy.AddField_management(dfStore, str(header) ,'TEXT')
# Get RowID to read column data from
# text between the last backslash and the last underscore of the path
filename=fl[fl.rfind('\\')+1:fl.rfind('_')]
for field in SearchStrings:
#print field, filename
if field.endswith(filename):
# rowID is a str here: the text in front of the first underscore
rowID=field[:field.find('_')]
with open(AOI, 'rb') as f:
readCSV= csv.reader(f)
# a csv.reader is an iterator and cannot be indexed; this line raises
# TypeError: '_csv.reader' object is not subscriptable
text=readCSV[rowID][1]
## arcpy.CalculateField_management(fl, header, text,"PYTHON_9.3")
=== UPDATED CODE BASED ON COMMENTS - it's all working fine, if anyone needs it.
import os, sys, datetime, csv, arcpy, string
from subprocess import Popen
from itertools import islice
# Script outline: build search strings from AOI.csv and keep its rows, walk
# SourceDIR for matching .shp files, add one TEXT field per CSV header to each
# match, then look up CSV values for it.
# NOTE(review): the indentation of this listing has been lost, so the exact
# nesting of the loops and conditionals below cannot be confirmed from here.
top = os.getcwd() # change to a specific path if required.
# This otherwise starts with the directory the script is in (preferred).
RootOutput = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch' # change if you want output somewhere else
# top is overwritten here, so the os.getcwd() value above is not used
top=RootOutput
SourceDIR = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch\OM_011' # source of your data (subdirectories searched as well
outDIR = top+"\\workingFiles" # directory where output is written to. Includes temp files
finalDIR = top+"\\final" # folder for final data only
AOI = 'AOI.csv' # name of the file containing the las file names in the second column
# zero-based column indexes of the two CSV fields joined into a search string
Compare_Column = 3
Compare_Column2 = 4
# END setting base paths
# NOTHING BELOW should need editing.
FileTypes=['shp']
SearchStrings=[]
filecount=0
List =[]
count=0
x=0
os.chdir(top)
#Generate list with unique file name codes from CSV
# the file object opened here is never explicitly closed
FileList = csv.reader(open(AOI))
SearchStrings=[]
# every parsed CSV row is kept so it can be indexed by row number later
rows=[]
#FinalList=[]
rowID=0
for File in FileList:
#SearchStrings.append(File[0]+","+File[1])
# each entry has the form "<row index>_<first field>_<second field>"
SearchStrings.append(str(rowID)+'_'+File[Compare_Column]+'_'+File[Compare_Column2])
rows.append(File)
#FinalList.append()
rowID+=1
for root, dirs, files in os.walk(SourceDIR, topdown=False):
for fl in files:
currentFile=os.path.join(root, fl)
for FileType in FileTypes:
status= str.endswith(currentFile,FileType)
if status:
#List.append(currentFile)
for SearchString in SearchStrings:
#print currentFile, SearchString
# the "<row index>_" prefix is sliced off before the substring test
if str(SearchString[SearchString.find('_')+1:] in currentFile) == 'True':
#print str(currentFile)+str(status)
List.append(currentFile)
filecount=filecount+1
#del fl
# Get list of Column Names
headers_count = 1
with open(AOI) as fin:
headers = list(islice(fin, headers_count))
delimiter=','
# str() of a list is its repr, so the pieces produced by split() still carry
# the brackets, quotes and escaped newline of that repr
header=str(headers)
header_listT=header.split(delimiter)
header_list=[]
for hdr in header_listT:
# validated names are cut to their first 10 characters
# (presumably the shapefile field-name limit; confirm)
header_list.append(arcpy.ValidateTableName(hdr)[:10])
# Process matching files
columnID=1
for fl in List:
header_count=0
for header in header_list:
print header
dfStore=fl
# any error raised by AddField is silently discarded by the bare except
try:
arcpy.AddField_management(dfStore, str(header) ,'TEXT')
except:
pass
# Get RowID to read column data from
# text between the last backslash and the last underscore of the path
filename=fl[fl.rfind('\\')+1:fl.rfind('_')]
for field in SearchStrings:
#print field, filename
#print header, field
if field.endswith(filename):
#print 'FOUND......................'
# length of the path string; not read again in this listing
column_count=len(fl)
if columnID < len(header_list):
# the row index in front of the first underscore, converted to int for indexing
rowID=int(field[:field.find('_')])
text = rows[rowID][columnID]
print filename, header, text
columnID+=1
# NOTE(review): the third argument is the literal string "text", not the
# value of the variable text; confirm the expression evaluates as intended
arcpy.CalculateField_management(fl, header, "text" ,"PYTHON_9.3")
#arcpy.CalculateField_management("P:/2012/273_CCRC_Townplanning_Datasets/Working/scratch/OM_011/OM_011_Waterway_Envelopes_ccrc.shp","LGA_CODE","5","PYTHON","#")
Your problem is in these two lines:
readCSV= csv.reader(f)
text=readCSV[rowID][1]
csv.reader returns an iterator over the rows of the file; it cannot be indexed directly. You could use islice to get the element you want (next(islice(readCSV, rowID, rowID+1)), with rowID converted to an int), though a neater solution is to store the AOI rows in a list, indexed by rowID, when you read the file the first time (in the SearchStrings loop):
SearchStrings = []
rows = []  # every parsed row, so rows[rowID] can be looked up later
rowID=0
# the with-block closes the CSV file as soon as all rows have been read
with open(AOI) as aoi_file:
    FileList = csv.reader(aoi_file)
    for File in FileList:
        #SearchStrings.append(File[0]+","+File[1])
        SearchStrings.append(str(rowID)+'_'+File[Compare_Column]+'_'+File[Compare_Column2])
        rows.append(File)
        rowID=rowID+1
... # later
# the prefix before the first underscore is the row index; int() makes it
# usable as a list index
rowID=int(field[:field.find('_')])
text = rows[rowID][1]