Is there a way to prevent a backslash ('\\') from reduplicating to '\\\\'? - python

I have a regular structure of archives with the format `pathtofile\folder.zip`, and I need to access a file within each archive with the format `folder\image.png`.
The code I am using to achieve this is:
import zipfile
# Build the member name from the archive's base name (the path minus its
# ".zip" suffix), then extract that member into dst.
imgname = path[:-4].split('\\')[-1] + '\\image.png'
imgfile = archive.extract(imgname, path=dst)
Where path = pathtofile\\folder.zip
However this is throwing the error below:
KeyError: "There is no item named 'folder\\\\image.png' in the archive"
I have also tried removing the \\ from the imgname definition, but this throws the following error:
KeyError: "There is no item named 'folderimage.png' in the archive"
so it seems like the backslashes really are reduplicating themselves somehow.
I'm wondering what could be causing the backslashes to multiply; if anyone has any wise words they would be greatly appreciated!
Larger part of the code:
archives = dict()
sub_data = data['file_path']  # data is a pandas dataframe containing file names and data about each file
filenames = get_unique(list(sub_data))  # get_unique is equivalent to list(set(sequence))
# Map every archive path to the list of frame indices wanted from it.
for filename in filenames:
    zip_path = image_dir+filename+'.zip'
    fil = sub_data == filename
    frames = [f for f in data.loc[fil, 'local_index']]
    archives.update({zip_path: frames})
# keys[i] pairs with filenames[i] because the dict was filled in filenames order.
keys = list(archives.keys())
for i in range(len(filenames)):
    key = keys[i]
    # The context manager closes the archive even when an extraction fails.
    with zipfile.ZipFile(key, 'r') as archive:
        dst = output_image_dir+'\\'+filenames[i]  # output_image_dir is a filepath not ending in '\\'
        if not os.path.exists(dst):
            os.makedirs(dst)
        for frame in archives[key]:
            # NOTE(review): ZipFile.extract() looks the member up by its exact
            # archive name, and names inside a ZIP use '/' as the separator, so
            # a name joined with '\\' (what os.path.join yields on Windows) is
            # not found. The doubled backslashes in the KeyError text are only
            # repr() escaping of a single backslash. Confirm the real member
            # names with archive.namelist().
            imgname = os.path.join(key[:-4].split('\\')[-1], 'frame'+str(frame).zfill(7)+'.png')
            if not os.path.exists(dst+'\\'+imgname):
                imgfile = archive.extract(imgname, path=dst)

Related

Python: How do I assign the result of my function to a variable that i can read_csv

The code that I have determines which Operating System is being used. Then it has to search the entire system for my csv file. When it's found I need to be able to read in the csv file (so that its not just inside the function, but useable throughout my code).
So far I am able to locate my file, but I am having trouble assigning the file path to a variable so that I can read that variable in with pd.read_csv().
The code that I have is as follows:
import pandas as pd
import os
import re
import win32api
# https://stackoverflow.com/questions/13067686/search-files-in-all-drives-using-python
def find_file(root_folder, rex):
    """Walk root_folder and stop at the first file name that matches rex.

    Prints the full path of the matching file and returns the regex match
    object (not the path). Falls off the end, returning None, when no file
    name matches.
    """
    for root, dirs, files in os.walk(root_folder):
        for f in files:
            result = rex.search(f)
            if result:
                print(os.path.join(root, f))
                # Returning already ends the search, so no break is needed.
                return result
def find_file_in_all_drives(file_name):
    """Search every logical drive for a file whose name matches file_name.

    file_name is compiled as a regular expression. The result of each
    find_file() call is discarded and the function returns None, so callers
    receive no path back.
    """
    # create a regular expression for the file
    rex = re.compile(file_name)
    # GetLogicalDriveStrings() is NUL-separated; the final split item is dropped.
    for drive in win32api.GetLogicalDriveStrings().split('\000')[:-1]:
        find_file(drive, rex)
    return
#file_name = "AB_NYC_2019.csv"
#find_file_in_all_drives(file_name)
df_location = find_file_in_all_drives( "AB_NYC_2019.csv" )
df = pd.read_csv(df_location)
I think that something is not right with the return.
Thank you for your time.
Right now it returns "None"
You haven't returned anything from anywhere.
I'm considering your code to be working and I've placed the necessary return calls but haven't tested it:
def find_file(root_folder, rex):
    """Return the path of the first file under root_folder matching rex.

    root_folder is walked with os.walk; rex is a compiled pattern that is
    searched against each bare file name (not the full path). Returns the
    joined path of the first match, or None when nothing matches.
    """
    for root, dirs, files in os.walk(root_folder):
        for f in files:
            if rex.search(f):
                return os.path.join(root, f)
    return None
def find_file_in_all_drives(file_name):
    """Collect the first matching file path from each logical drive.

    file_name is compiled as a regular expression, so characters such as
    "." act as regex metacharacters. Returns a list with at most one path
    per drive; the list is empty when no drive contains a match.
    """
    matching_files = list()
    # create a regular expression for the file
    rex = re.compile(file_name)
    # GetLogicalDriveStrings() is NUL-separated; the final split item is dropped.
    for drive in win32api.GetLogicalDriveStrings().split('\000')[:-1]:
        file_path = find_file(drive, rex)
        if file_path:
            matching_files.append(file_path)
    return matching_files
df_location = find_file_in_all_drives("AB_NYC_2019.csv")
first_file_df = pd.read_csv(df_location[0])

Is there a way to load data from all files in a directory using Python?

My question: Is there a way to load data from all files in a directory using Python
Input: Get all files in a given directory of mine (wow.txt, testting.txt,etc.)
Process: I want to run all the files through a def function
Output: I want the output to be all the file names, each with its respective content below it. For example:
/home/file/wow.txt
"all of its content"
/home/file/www.txt
"all of its content"
Here is my code:
# Import Functions
import os
import sys
# Define the file path
path="/home/my_files"
file_name="wow.txt"
#Load Data Function
def load_data(path,file_name):
    """
    Input  : path and file_name
    Purpose: loading text file
    Output : list of paragraphs/documents and
             title(initial 100 characters considered as title of document)

    Every line of the file is one document. A title is recorded for each
    document, so both returned lists have the same length; an empty file
    yields two empty lists.
    """
    documents_list = []
    titles = []
    with open(os.path.join(path, file_name), "rt", encoding='latin-1') as fin:
        for line in fin:
            text = line.strip()
            documents_list.append(text)
            # Slicing already copes with lines shorter than 100 characters.
            titles.append(text[:100])
    print("Total Number of Documents:",len(documents_list))
    return documents_list,titles
#Output
load_data(path,file_name)
Here is my output:
My Problem is that my output only takes one file and shows its content. Obviously, i defined the path and file name in my code to one file but I am confused as to how to write the path in a way to load all the files and output each of its contents separately. Any suggestions?
Using glob:
import glob
files = glob.glob("*.txt") # get all the .txt files
for file in files: # iterate over the list of files
with open(file, "r") as fin: # open the file
# rest of the code
Using os.listdir():
import os
arr = os.listdir()
files = [x for x in arr if x.endswith('.txt')]
for file in files: # iterate over the list of files
with open(file, "r") as fin: # open the file
# rest of the code
Try this:
import glob
for file in glob.glob("test/*.xyz"):
print(file)
if my directory name was "test" and I had lots of xyz files in them...
You can use glob and pandas
import pandas as pd
import glob
path = r'some_directory' # use your path
all_files = glob.glob(path + "/*.txt")
li = []
for filename in all_files:
#read file here
# if you decide to use pandas you might need to use the 'sep' parameter as well
df = pd.read_csv(filename, index_col=None, header=0)
li.append(df)
# get it all together
frame = pd.concat(li, axis=0, ignore_index=True)
I will take advantage of the function you have already written, so use the following:
data = []
path="/home/my_files"
dirs = os.listdir( path )
for file in dirs:
data.append(load_data(path, file))
In this case you will have all data in the list data.
Hi you can use a for loop on a listdir:
os.listdir(<path of your directory>)
this gives you the list of files in your directory, but this gives you also the name of folders in that directory
Try generating a file list first, then passing that to a modified version of your function.
def dir_recursive(dirName):
    """Return the paths of all ``.txt`` files below dirName, recursively.

    Walks dirName with os.walk and keeps every existing file whose path ends
    in the literal suffix ".txt". Result order follows os.walk.
    """
    import os
    fList = list()
    for (folder, _, files) in os.walk(dirName):
        for f in files:
            path = os.path.join(folder, f)
            # A literal suffix test: an unescaped regex dot would also accept
            # names such as "footxt".
            if os.path.exists(path) and path.endswith('.txt'):
                fList.append(path)
    return fList
def load_data2(file_list):
    """Load every line of every file in file_list as one document.

    Returns (documents_list, titles): the stripped lines of all files in
    order, and for each document its first 100 characters as a title. Both
    lists are empty when file_list is empty or the files contain no lines.
    """
    documents_list = []
    titles = []
    for file_path in file_list:
        with open(file_path, "rt", encoding='latin-1') as fin:
            for line in fin:
                text = line.strip()
                documents_list.append(text)
                # Slicing already copes with lines shorter than 100 characters.
                titles.append(text[:100])
    print("Total Number of Documents:",len(documents_list))
    return documents_list,titles
# Generate a file list & load the data from it
file_list = dir_recursive(path)
documents_list, titles = load_data2(file_list)

Read CSV starting with string from Zipfile

I'm trying to loop through a folder that has zip files in it, and only extracting the csv files that start with a certain prefix.
Here is the code:
for name in glob.glob(path + '/*.zip'):
zf = zipfile.ZipFile(name)
csv_file = pd.read_csv(zf.open('Common_MarketResults*.csv'))
df = pd.concat(csv_file, axis=0).reset_index()
The csv file has some dates after the string I am using, which will be different in every zip file. I am receiving the following error message:
KeyError: "There is no item named 'Common_MarketResults*.csv' in the archive"
Searching for substrings in the filename made this possible.
sub = 'Common_MarketResults'
suf = 'csv'
data = []
# Same glob pattern as in the question: every .zip directly inside path.
for name in glob.glob(path + '/*.zip'):
    # Open each archive once; the context manager closes it afterwards.
    with zipfile.ZipFile(name) as zf:
        for s in zf.namelist():
            # Wildcards are not understood by ZipFile.open(), so pick the
            # members by name: contains the prefix and has the csv extension.
            if sub in s and s.endswith('.' + suf):
                csv_file_str = s
                with zf.open(csv_file_str) as member:
                    csv_file = pd.read_csv(member)
                # Remember which archive member each frame came from.
                csv_file['file_name'] = csv_file_str
                data.append(csv_file)

How do I apply my python code to all of the files in a folder at once, and how do I create a new name for each subsequent output file?

The code I am working with takes in a .pdf file, and outputs a .txt file. My question is, how do I create a loop (probably a for loop) which runs the code over and over again on all files in a folder which end in ".pdf"? Furthermore, how do I change the output each time the loop runs so that I can write a new file each time, that has the same name as the input file (ie. 1_pet.pdf > 1_pet.txt, 2_pet.pdf > 2_pet.txt, etc.)
Here is the code so far:
path="2_pet.pdf"
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
The following script solves your problem:
import os
sourcedir = 'pdfdir'
dl = os.listdir(sourcedir)
for f in dl:
    # splitext copes with names that have no dot or more than one dot.
    root, ext = os.path.splitext(f)
    if ext == ".pdf":
        # Join with the directory name, not with the listing itself.
        path_in = os.path.join(sourcedir, f)
        content = getPDFContent(path_in)
        encoded = content.encode("utf-8")
        # Same base name as the input, with a .txt extension.
        path_out = os.path.join(sourcedir, root + ".txt")
        # NOTE(review): writing encoded bytes to a text-mode file only works
        # on Python 2; on Python 3 open with 'wb' instead.
        with open(path_out, 'w') as text_file:
            text_file.write(encoded)
Create a function that encapsulates what you want to do to each file.
import os.path
def parse_pdf(filename):
    """Parse a pdf into text.

    Extracts the content of filename with getPDFContent and writes it,
    UTF-8 encoded, next to the input under the same name with a .txt
    extension.
    """
    content = getPDFContent(filename)
    encoded = content.encode("utf-8")
    ## split off the pdf extension to add .txt instead.
    (root, _) = os.path.splitext(filename)
    # NOTE(review): writing encoded bytes to a text-mode file only works on
    # Python 2; on Python 3 open with "wb" instead.
    with open(root + ".txt", "w") as text_file:
        text_file.write(encoded)
Then apply this function to a list of filenames, like so:
for f in files:
parse_pdf(f)
One way to operate on all PDF files in a directory is to invoke glob.glob() and iterate over the results:
import glob
import os
for path in glob.glob('*.pdf'):
    content = getPDFContent(path)
    encoded = content.encode("utf-8")
    # Name the output after the input (1_pet.pdf -> 1_pet.txt) so that each
    # iteration writes its own file instead of overwriting a shared one.
    root, _ = os.path.splitext(path)
    # NOTE(review): writing encoded bytes to a text-mode file only works on
    # Python 2; on Python 3 open with "wb" instead.
    with open(root + ".txt", "w") as text_file:
        text_file.write(encoded)
Another way is to allow the user to specify the files:
import sys
for path in sys.argv[1:]:
...
Then the user runs your script like python foo.py *.pdf.
You could use a recursive function to search the folder and all its subfolders for files that end with .pdf, then take those files and create a text file for each one.
It could be something like:
import os
def convert_PDF(path, func):
    """Apply func to every ``.pdf`` file at or below path.

    If path is a directory, each of its entries is visited recursively. If
    it is anything else and its name ends in ".pdf", func(path) is called.
    Other files are ignored. Returns None.
    """
    if os.path.isdir(path):
        # Plain loop: the recursion is done for its side effects only.
        for entry in os.listdir(path):
            convert_PDF(os.path.join(path, entry), func)
    elif os.path.basename(path)[-4:] == '.pdf':
        func(path)
# based entirely on your example code
def convert_to_txt(path):
    """Write the text content of the PDF at path to a sibling .txt file.

    The output keeps the input's directory and base name; only the last four
    characters of the file name (the ".pdf" extension) are replaced by ".txt".
    """
    content = getPDFContent(path)
    encoded = content.encode("utf-8")
    file_path = os.path.dirname(path)
    # replace pdf with txt extension
    file_name = os.path.basename(path)[:-4]+'.txt'
    # os.path.join leaves a bare file name relative; gluing '/' onto an empty
    # dirname would point at the filesystem root instead.
    out_path = os.path.join(file_path, file_name)
    # NOTE(review): writing encoded bytes to a text-mode file only works on
    # Python 2; on Python 3 open with "wb" instead.
    with open(out_path, "w") as text_file:
        text_file.write(encoded)
convert_PDF('path/to/files', convert_to_txt)
Because the actual operation is changeable, you can replace the function with whatever operation you need to perform (like using a different library, converting to a different type, etc.)

Python - stuck at reading a specific row from a csv

I need to add columns to a "matched" shape file based on a csv. I have one last step to complete which is to get the value to enter into the shp from the csv.
I get
readCSV[rowID] Traceback (most recent call last): File "", line 1, in TypeError: '_csv.reader'
object is not subscriptable
The stripped down CSV is
The files look like
The code matches OVL_CAT + OVL2_DESC to the File Name.
I then get the code to add a column called LGA_CODE and need to populate it with '583094' which is row 2, column 1...how do I get this when I can't call FileList2 to get row 2 from the csv (3 in the example below but 2 in python)?
import os, sys, datetime, csv, arcpy, string
from subprocess import Popen
from itertools import islice
top = os.getcwd() # change to a specific path if required.
# This otherwise starts with the directory the script is in (preferred).
RootOutput = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch' # change if you want output somewhere else
top=RootOutput
SourceDIR = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch\OM_011' # source of your data (subdirectories searched as well
outDIR = top+"\\workingFiles" # directory where output is written to. Includes temp files
finalDIR = top+"\\final" # folder for final data only
AOI = 'AOI.csv' # name of the file containing the las file names in the second column
Compare_Column = 2
Compare_Column2 = 3
# END setting base paths
# NOTHING BELOW should need editing.
FileTypes=['shp']
SearchStrings=[]
filecount=0
List =[]
count=0
x=0
os.chdir(top)
#Generate list with unique file name codes from CSV
FileList = csv.reader(open(AOI))
SearchStrings=[]
rowID=0
for File in FileList:
#SearchStrings.append(File[0]+","+File[1])
SearchStrings.append(str(rowID)+'_'+File[Compare_Column]+'_'+File[Compare_Column2])
rowID=rowID+1
for root, dirs, files in os.walk(SourceDIR, topdown=False):
for fl in files:
currentFile=os.path.join(root, fl)
for FileType in FileTypes:
status= str.endswith(currentFile,FileType)
if str(status) == 'True':
List.append(currentFile)
for SearchString in SearchStrings:
#print currentFile
#print SearchString
if str(SearchString in currentFile) == 'True':
#print str(currentFile)+str(status)
List.append(currentFile)
filecount=filecount+1
#del fl
# Get list of Column Names
headers_count = 1
with open(AOI) as fin:
headers = list(islice(fin, headers_count))
delimiter=','
header=str(headers)
header_list=header.split(delimiter)
# Process matching files
for fl in List:
header_count=0
for header in header_list:
dfStore=fl
#arcpy.AddField_management(dfStore, str(header) ,'TEXT')
# Get RowID to read column data from
filename=fl[fl.rfind('\\')+1:fl.rfind('_')]
for field in SearchStrings:
#print field, filename
if field.endswith(filename):
rowID=field[:field.find('_')]
with open(AOI, 'rb') as f:
readCSV= csv.reader(f)
text=readCSV[rowID][1]
## arcpy.CalculateField_management(fl, header, text,"PYTHON_9.3")
=== UPDATED CODE BASED ON COMMENTS - it's all working fine if anyone needs it.
import os, sys, datetime, csv, arcpy, string
from subprocess import Popen
from itertools import islice
top = os.getcwd() # change to a specific path if required.
# This otherwise starts with the directory the script is in (preferred).
RootOutput = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch' # change if you want output somewhere else
top=RootOutput
SourceDIR = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch\OM_011' # source of your data (subdirectories searched as well
outDIR = top+"\\workingFiles" # directory where output is written to. Includes temp files
finalDIR = top+"\\final" # folder for final data only
AOI = 'AOI.csv' # name of the file containing the las file names in the second column
Compare_Column = 3
Compare_Column2 = 4
# END setting base paths
# NOTHING BELOW should need editing.
FileTypes=['shp']
SearchStrings=[]
filecount=0
List =[]
count=0
x=0
os.chdir(top)
#Generate list with unique file name codes from CSV
FileList = csv.reader(open(AOI))
SearchStrings=[]
rows=[]
#FinalList=[]
rowID=0
for File in FileList:
#SearchStrings.append(File[0]+","+File[1])
SearchStrings.append(str(rowID)+'_'+File[Compare_Column]+'_'+File[Compare_Column2])
rows.append(File)
#FinalList.append()
rowID+=1
for root, dirs, files in os.walk(SourceDIR, topdown=False):
for fl in files:
currentFile=os.path.join(root, fl)
for FileType in FileTypes:
status= str.endswith(currentFile,FileType)
if status:
#List.append(currentFile)
for SearchString in SearchStrings:
#print currentFile, SearchString
if str(SearchString[SearchString.find('_')+1:] in currentFile) == 'True':
#print str(currentFile)+str(status)
List.append(currentFile)
filecount=filecount+1
#del fl
# Get list of Column Names
headers_count = 1
with open(AOI) as fin:
headers = list(islice(fin, headers_count))
delimiter=','
header=str(headers)
header_listT=header.split(delimiter)
header_list=[]
for hdr in header_listT:
header_list.append(arcpy.ValidateTableName(hdr)[:10])
# Process matching files
columnID=1
for fl in List:
header_count=0
for header in header_list:
print header
dfStore=fl
try:
arcpy.AddField_management(dfStore, str(header) ,'TEXT')
except:
pass
# Get RowID to read column data from
filename=fl[fl.rfind('\\')+1:fl.rfind('_')]
for field in SearchStrings:
#print field, filename
#print header, field
if field.endswith(filename):
#print 'FOUND......................'
column_count=len(fl)
if columnID < len(header_list):
rowID=int(field[:field.find('_')])
text = rows[rowID][columnID]
print filename, header, text
columnID+=1
arcpy.CalculateField_management(fl, header, "text" ,"PYTHON_9.3")
#arcpy.CalculateField_management("P:/2012/273_CCRC_Townplanning_Datasets/Working/scratch/OM_011/OM_011_Waterway_Envelopes_ccrc.shp","LGA_CODE","5","PYTHON","#")
Your problem is in these two lines:
readCSV= csv.reader(f)
text=readCSV[rowID][1]
csv.reader is an iterator over the rows of the file; it cannot be indexed directly. You could use islice to get the row you want (next(islice(readCSV, rowID, rowID+1)), with rowID converted to an int), though a neater solution is to store the AOI rows in a list, indexed by rowID, when you read the file the first time (in the SearchStrings loop):
FileList = csv.reader(open(AOI))
SearchStrings = []
rows = []
rowID=0
for File in FileList:
#SearchStrings.append(File[0]+","+File[1])
SearchStrings.append(str(rowID)+'_'+File[Compare_Column]+'_'+File[Compare_Column2])
rows.append(File)
rowID=rowID+1
... # later
rowID=int(field[:field.find('_')])
text = rows[rowID][1]

Categories

Resources