I'm sure this is a simple thing to do but I don't know how. What I want to achieve is something like this:
templateFilename = str( templateFilename )
# If no file extension is found, assume it is a .npy file
if templateFilename.endswith( '.*' ):
templateFilename += ".npy"
However, this syntax doesn't seem to work. I want the * to represent any file extension so that, if the parsed file does contain a file extension, that one will be used but, if not, a standard extension will be added.
I have read about the glob module and people seem to be using that for finding things such as *.txt, etc. but I'm not sure how it works.
I would suggest os.path.splitext. The following uses .npy as the extension if none exists:
root, ext = os.path.splitext(path)
if not ext:
ext = '.npy'
path = root + ext
(Speaking from experience and hair-loss)
Doing a split on . and then selecting the second element [1] will only work if you can absolutely guarantee that there are no . in the filename; otherwise you'll need something like this:
file_extension = [".csv", ".xml", ".html"]
if '.' in templateFilename: #checks if you can actually split, if you can't perform a split; you would raise an index error.
if templateFilename.split(".")[-1] in file_extension: #[-1] = the last element in the list.
has_extension = true
has_verified_extension = true
else:
has_extension = true
has_verified_extension = false
else: #no '.'. in the filename, so no extension.
has_extension = false
Usage:
file_extension = [".pyo", ".npy", ".py"]
templateFilename = str( templateFilename )
# If no file extension is found, assume it is a .npy file
if not templateFilename.split(".")[1] in file_extension:
templateFilename += ".npy"
If you want in one line then here it is :
templatefilename = "abcd"
non_ext_file_list = [filename + ".npy" for filename in templateFilename.split(".") if not "." in templateFilename]
#output
[abcd.npy]
Trying to create a simple code, to batch rename a folder in windows.
Musts:
change every number , like "file02.txt", turn 02 into 0002
maybe work for every file format, like jpg, png, txt, docx and so on (becuse I'm not sure what will be in the folder, this code might be used for image sequences...)
Is this possible?
I did test versions, combination of the little knowledge I have, but it gets me confused.
my code so far:
import os
import sys
folder_path = os.listdir(raw_input("Insert folder path: "))
print "Files in folder: %s" % folder_path
# a split tool
def mysplit(s):
head = s.rstrip('0123456789')
tail = s[len(head):]
return head, tail
# a function to make a new name with needed 0000
def new_filename(filename):
file_name_part, ext = os.path.splitext(filename) # file01 and .ext
original_name, number = mysplit(file_name_part) # 01 and file
add_zero = number.rjust(4, "0") # add 0001
new_name = original_name + add_zero + ext
print new_name
# new_name comes like this ['file0001.txt'] but seperate, not in a list? Why?
for current_file_n in folder_path:
new = new_filename(current_file_n)
print list([new]) # trying to make the str into a list....
re_name = os.renames(current_file_n, new)
print re_name
print "Renamed files: %s" % folder_path
The desired outcome is the same as the beginning list, but collated with zeros,like this: ['file0001.txt', 'file0002.txt', 'file0003.txt'......'file0015.txt']
I've got errors like windows error: can't find file, and another error; can't connect str and list?
I need an explanation of what I'm doing wrong as simple as possible, or is there another method that I can use that will give me the desired outcome?
As martineau said your indentation is messed up.
Here's the working code:
import os
import sys
# a split tool
def mysplit(s):
head = s.rstrip('0123456789')
tail = s[len(head):]
return head, tail
# a function to make a new name with needed 0000
def new_filename(filename):
file_name_part, ext = os.path.splitext(filename) # file01 and .ext
original_name, number = mysplit(file_name_part) # 01 and file
add_zero = number.rjust(4, "0") # add 0001
new_name = original_name + add_zero + ext
return new_name
# new_name comes like this ['file0001.txt'] but seperate, not in a list? Why?
if __name__ == '__main__':
folder_path = os.listdir(raw_input("Insert folder path: "))
print "Files in folder: %s" % folder_path
renamed_files = []
for current_file_n in folder_path:
new = new_filename(current_file_n)
renamed_files.append(new) # Add renamed file's name to a list
try:
os.renames(current_file_n, new) #It doesn't return anything
print new
except:
print "Unexpected error while renaming %s:%s"%(new, sys.exc_info()[0])
print "Renamed files: %s" % renamed_files
Hope this helps
Your code can be simplified a lot by using regular expression substitution. re.sub() can take a replacement function. In this case adding leading zeroes to the first number found in the filename.
import os, re
def renumber_files(directory, zeroes=4):
os.chdir(directory)
for filename in os.listdir(directory):
new_name = re.sub(r'\d+', lambda m: m.group().zfill(zeroes), filename, count=1)
os.rename(filename, new_name)
renumber_files(raw_input("Insert folder path: "))
This works because re.sub() can take a callable as the replacement argument.
Signature: re.sub(pattern, repl, string, count=0, flags=0)
Return the string obtained by replacing the leftmost non-overlapping
occurrences of the pattern in string by the replacement repl. repl
can be either a string or a callable; if a string, backslash escapes
in it are processed. If it is a callable, it's passed the match
object and must return a replacement string to be used.
In the lambda m.group() returns a string matching the pattern \d+. For instance "1", "564645" or "005".
The next step, str.zfill(4), turns those into "0001", "564645", or "0005".
To secure uploaded image names, I'd like to strip out image's filenames from anything but string.ascii_letters , string.digits, dot and (one) whitespace.
So I'm wondering what is the best method to check a text against other characters?
import re
import os
s = 'asodgnasAIDID12313%*(#&(!$ 1231'
result = re.sub('[^a-zA-Z\d\. ]|( ){2,}','',s )
if result =='' or os.path.splitext(result)[0].isspace():
print "not a valid name"
else:
print "valid name"
EDIT:
changed it so it will also whitelist only one whitespace + added import re
Not sure if it's what you need but give it a try:
import sys, os
fileName, fileExtension = os.path.splitext('image 11%%22.jpg')
fileExtension = fileExtension.encode('ascii', 'ignore')
fileName = fileName.encode('ascii', 'ignore')
if fileExtension[1:] in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'tga']:
fileName = ''.join(e for e in fileName if e.isalnum())
print fileName+fileExtension
#image1122.jpg
else:
print "Extension not supported"
isalnum()
https://docs.python.org/2/library/stdtypes.html#str.isalnum
I wouldn't use regex for this. The only tricky requirement is the single space, but that can be done, too.
import string
whitelist = set(string.ascii_letters + string.digits)
good_filename = "herearesomelettersand123numbers andonespace"
bad_filename = "symbols&#! and more than one space"
def strip_filename(fname, whitelist):
"""Strips a filename
Removes any character from string `fname` and removes all but one
whitespace.
"""
whitelist.add(" ")
stripped = ''.join([ch for ch in fname if ch in whitelist])
split = stripped.split()
result = " ".join([split[0], ''.join(split[1:])])
return result
Then call it with:
good_sanitized = strip_filename(good_filename, whitelist)
bad_sanitized = strip_filename(bad_filename, whitelist)
print(good_sanitized)
# 'herearesomelettersand123numbers andonespace'
print(bad_sanitized)
# 'symbols andmorethanonespace'
The code I am working with takes in a .pdf file, and outputs a .txt file. My question is, how do I create a loop (probably a for loop) which runs the code over and over again on all files in a folder which end in ".pdf"? Furthermore, how do I change the output each time the loop runs so that I can write a new file each time, that has the same name as the input file (ie. 1_pet.pdf > 1_pet.txt, 2_pet.pdf > 2_pet.txt, etc.)
Here is the code so far:
path="2_pet.pdf"
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
The following script solve your problem:
import os
sourcedir = 'pdfdir'
dl = os.listdir('pdfdir')
for f in dl:
fs = f.split(".")
if fs[1] == "pdf":
path_in = os.path.join(dl,f)
content = getPDFContent(path_in)
encoded = content.encode("utf-8")
path_out = os.path.join(dl,fs[0] + ".txt")
text_file = open(path_out, 'w')
text_file.write(encoded)
text_file.close()
Create a function that encapsulates what you want to do to each file.
import os.path
def parse_pdf(filename):
"Parse a pdf into text"
content = getPDFContent(filename)
encoded = content.encode("utf-8")
## split of the pdf extension to add .txt instead.
(root, _) = os.path.splitext(filename)
text_file = open(root + ".txt", "w")
text_file.write(encoded)
text_file.close()
Then apply this function to a list of filenames, like so:
for f in files:
parse_pdf(f)
One way to operate on all PDF files in a directory is to invoke glob.glob() and iterate over the results:
import glob
for path in glob.glob('*.pdf')
content = getPDFContent(path)
encoded = content.encode("utf-8")
text_file = open("Output.txt", "w")
text_file.write(encoded)
text_file.close()
Another way is to allow the user to specify the files:
import sys
for path in sys.argv[1:]:
...
Then the user runs your script like python foo.py *.pdf.
You could use a recursive function to search the folders and all subfolders for files that end with pdf. Than take those files and then create a text file for it.
It could be something like:
import os
def convert_PDF(path, func):
d = os.path.basename(path)
if os.path.isdir(path):
[convert_PDF(os.path.join(path,x), func) for x in os.listdir(path)]
elif d[-4:] == '.pdf':
funct(path)
# based entirely on your example code
def convert_to_txt(path):
content = getPDFContent(path)
encoded = content.encode("utf-8")
file_path = os.path.dirname(path)
# replace pdf with txt extension
file_name = os.path.basename(path)[:-4]+'.txt'
text_file = open(file_path +'/'+file_name, "w")
text_file.write(encoded)
text_file.close()
convert_PDF('path/to/files', convert_to_txt)
Because the actual operation is changeable, you can replace the function with whatever operation you need to perform (like using a different library, converting to a different type, etc.)
Is there a function to extract the extension from a filename?
Use os.path.splitext:
>>> import os
>>> filename, file_extension = os.path.splitext('/path/to/somefile.ext')
>>> filename
'/path/to/somefile'
>>> file_extension
'.ext'
Unlike most manual string-splitting attempts, os.path.splitext will correctly treat /a/b.c/d as having no extension instead of having extension .c/d, and it will treat .bashrc as having no extension instead of having extension .bashrc:
>>> os.path.splitext('/a/b.c/d')
('/a/b.c/d', '')
>>> os.path.splitext('.bashrc')
('.bashrc', '')
New in version 3.4.
import pathlib
print(pathlib.Path('yourPath.example').suffix) # '.example'
print(pathlib.Path("hello/foo.bar.tar.gz").suffixes) # ['.bar', '.tar', '.gz']
I'm surprised no one has mentioned pathlib yet, pathlib IS awesome!
import os.path
extension = os.path.splitext(filename)[1]
import os.path
extension = os.path.splitext(filename)[1][1:]
To get only the text of the extension, without the dot.
For simple use cases one option may be splitting from dot:
>>> filename = "example.jpeg"
>>> filename.split(".")[-1]
'jpeg'
No error when file doesn't have an extension:
>>> "filename".split(".")[-1]
'filename'
But you must be careful:
>>> "png".split(".")[-1]
'png' # But file doesn't have an extension
Also will not work with hidden files in Unix systems:
>>> ".bashrc".split(".")[-1]
'bashrc' # But this is not an extension
For general use, prefer os.path.splitext
worth adding a lower in there so you don't find yourself wondering why the JPG's aren't showing up in your list.
os.path.splitext(filename)[1][1:].strip().lower()
Any of the solutions above work, but on linux I have found that there is a newline at the end of the extension string which will prevent matches from succeeding. Add the strip() method to the end. For example:
import os.path
extension = os.path.splitext(filename)[1][1:].strip()
You can find some great stuff in pathlib module (available in python 3.x).
import pathlib
x = pathlib.PurePosixPath("C:\\Path\\To\\File\\myfile.txt").suffix
print(x)
# Output
'.txt'
With splitext there are problems with files with double extension (e.g. file.tar.gz, file.tar.bz2, etc..)
>>> fileName, fileExtension = os.path.splitext('/path/to/somefile.tar.gz')
>>> fileExtension
'.gz'
but should be: .tar.gz
The possible solutions are here
Although it is an old topic, but i wonder why there is none mentioning a very simple api of python called rpartition in this case:
to get extension of a given file absolute path, you can simply type:
filepath.rpartition('.')[-1]
example:
path = '/home/jersey/remote/data/test.csv'
print path.rpartition('.')[-1]
will give you: 'csv'
Just join all pathlib suffixes.
>>> x = 'file/path/archive.tar.gz'
>>> y = 'file/path/text.txt'
>>> ''.join(pathlib.Path(x).suffixes)
'.tar.gz'
>>> ''.join(pathlib.Path(y).suffixes)
'.txt'
Surprised this wasn't mentioned yet:
import os
fn = '/some/path/a.tar.gz'
basename = os.path.basename(fn) # os independent
Out[] a.tar.gz
base = basename.split('.')[0]
Out[] a
ext = '.'.join(basename.split('.')[1:]) # <-- main part
# if you want a leading '.', and if no result `None`:
ext = '.' + ext if ext else None
Out[] .tar.gz
Benefits:
Works as expected for anything I can think of
No modules
No regex
Cross-platform
Easily extendible (e.g. no leading dots for extension, only last part of extension)
As function:
def get_extension(filename):
basename = os.path.basename(filename) # os independent
ext = '.'.join(basename.split('.')[1:])
return '.' + ext if ext else None
You can use a split on a filename:
f_extns = filename.split(".")
print ("The extension of the file is : " + repr(f_extns[-1]))
This does not require additional library
filename='ext.tar.gz'
extension = filename[filename.rfind('.'):]
Extracting extension from filename in Python
Python os module splitext()
splitext() function splits the file path into a tuple having two values – root and extension.
import os
# unpacking the tuple
file_name, file_extension = os.path.splitext("/Users/Username/abc.txt")
print(file_name)
print(file_extension)
Get File Extension using Pathlib Module
Pathlib module to get the file extension
import pathlib
pathlib.Path("/Users/pankaj/abc.txt").suffix
#output:'.txt'
Even this question is already answered I'd add the solution in Regex.
>>> import re
>>> file_suffix = ".*(\..*)"
>>> result = re.search(file_suffix, "somefile.ext")
>>> result.group(1)
'.ext'
This is a direct string representation techniques :
I see a lot of solutions mentioned, but I think most are looking at split.
Split however does it at every occurrence of "." .
What you would rather be looking for is partition.
string = "folder/to_path/filename.ext"
extension = string.rpartition(".")[-1]
Another solution with right split:
# to get extension only
s = 'test.ext'
if '.' in s: ext = s.rsplit('.', 1)[1]
# or, to get file name and extension
def split_filepath(s):
"""
get filename and extension from filepath
filepath -> (filename, extension)
"""
if not '.' in s: return (s, '')
r = s.rsplit('.', 1)
return (r[0], r[1])
you can use following code to split file name and extension.
import os.path
filenamewithext = os.path.basename(filepath)
filename, ext = os.path.splitext(filenamewithext)
#print file name
print(filename)
#print file extension
print(ext)
A true one-liner, if you like regex.
And it doesn't matter even if you have additional "." in the middle
import re
file_ext = re.search(r"\.([^.]+)$", filename).group(1)
See here for the result: Click Here
Well , i know im late
that's my simple solution
file = '/foo/bar/whatever.ext'
extension = file.split('.')[-1]
print(extension)
#output will be ext
try this:
files = ['file.jpeg','file.tar.gz','file.png','file.foo.bar','file.etc']
pen_ext = ['foo', 'tar', 'bar', 'etc']
for file in files: #1
if (file.split(".")[-2] in pen_ext): #2
ext = file.split(".")[-2]+"."+file.split(".")[-1]#3
else:
ext = file.split(".")[-1] #4
print (ext) #5
get all file name inside the list
splitting file name and check the penultimate extension, is it in the pen_ext list or not?
if yes then join it with the last extension and set it as the file's extension
if not then just put the last extension as the file's extension
and then check it out
You can use endswith to identify the file extension in python
like bellow example
for file in os.listdir():
if file.endswith('.csv'):
df1 =pd.read_csv(file)
frames.append(df1)
result = pd.concat(frames)
For funsies... just collect the extensions in a dict, and track all of them in a folder. Then just pull the extensions you want.
import os
search = {}
for f in os.listdir(os.getcwd()):
fn, fe = os.path.splitext(f)
try:
search[fe].append(f)
except:
search[fe]=[f,]
extensions = ('.png','.jpg')
for ex in extensions:
found = search.get(ex,'')
if found:
print(found)
This method will require a dictonary, list, or set. you can just use ".endswith" using built in string methods. This will search for name in list at end of file and can be done with just str.endswith(fileName[index]). This is more for getting and comparing extensions.
https://docs.python.org/3/library/stdtypes.html#string-methods
Example 1:
dictonary = {0:".tar.gz", 1:".txt", 2:".exe", 3:".js", 4:".java", 5:".python", 6:".ruby",7:".c", 8:".bash", 9:".ps1", 10:".html", 11:".html5", 12:".css", 13:".json", 14:".abc"}
for x in dictonary.values():
str = "file" + x
str.endswith(x, str.index("."), len(str))
Example 2:
set1 = {".tar.gz", ".txt", ".exe", ".js", ".java", ".python", ".ruby", ".c", ".bash", ".ps1", ".html", ".html5", ".css", ".json", ".abc"}
for x in set1:
str = "file" + x
str.endswith(x, str.index("."), len(str))
Example 3:
fileName = [".tar.gz", ".txt", ".exe", ".js", ".java", ".python", ".ruby", ".c", ".bash", ".ps1", ".html", ".html5", ".css", ".json", ".abc"];
for x in range(0, len(fileName)):
str = "file" + fileName[x]
str.endswith(fileName[x], str.index("."), len(str))
Example 4
fileName = [".tar.gz", ".txt", ".exe", ".js", ".java", ".python", ".ruby", ".c", ".bash", ".ps1", ".html", ".html5", ".css", ".json", ".abc"];
str = "file.txt"
str.endswith(fileName[1], str.index("."), len(str))
Examples 5, 6, 7 with output
Example 8
fileName = [".tar.gz", ".txt", ".exe", ".js", ".java", ".python", ".ruby", ".c", ".bash", ".ps1", ".html", ".html5", ".css", ".json", ".abc"];
exts = []
str = "file.txt"
for x in range(0, len(x)):
if str.endswith(fileName[1]) == 1:
exts += [x]
The easiest way to get is to use mimtypes, below is the example:
import mimetypes
mt = mimetypes.guess_type("file name")
file_extension = mt[0]
print(file_extension)
Here if you want to extract the last file extension if it has multiple
class functions:
def listdir(self, filepath):
return os.listdir(filepath)
func = functions()
os.chdir("C:\\Users\Asus-pc\Downloads") #absolute path, change this to your directory
current_dir = os.getcwd()
for i in range(len(func.listdir(current_dir))): #i is set to numbers of files and directories on path directory
if os.path.isfile((func.listdir(current_dir))[i]): #check if it is a file
fileName = func.listdir(current_dir)[i] #put the current filename into a variable
rev_fileName = fileName[::-1] #reverse the filename
currentFileExtension = rev_fileName[:rev_fileName.index('.')][::-1] #extract from beginning until before .
print(currentFileExtension) #output can be mp3,pdf,ini,exe, depends on the file on your absolute directory
Output is mp3, even works if has only 1 extension name
I'm definitely late to the party, but in case anyone wanted to achieve this without the use of another library:
file_path = "example_tar.tar.gz"
file_name, file_ext = [file_path if "." not in file_path else file_path.split(".")[0], "" if "." not in file_path else file_path[file_path.find(".") + 1:]]
print(file_name, file_ext)
The 2nd line is basically just the following code but crammed into one line:
def name_and_ext(file_path):
if "." not in file_path:
file_name = file_path
else:
file_name = file_path.split(".")[0]
if "." not in file_path:
file_ext = ""
else:
file_ext = file_path[file_path.find(".") + 1:]
return [file_name, file_ext]
Even though this works, it might not work will all types of files, specifically .zshrc, I would recomment using os's os.path.splitext function, example below:
import os
file_path = "example.tar.gz"
file_name, file_ext = os.path.splitext(file_path)
print(file_name, file_ext)
Cheers :)
# try this, it works for anything, any length of extension
# e.g www.google.com/downloads/file1.gz.rs -> .gz.rs
import os.path
class LinkChecker:
#staticmethod
def get_link_extension(link: str)->str:
if link is None or link == "":
return ""
else:
paths = os.path.splitext(link)
ext = paths[1]
new_link = paths[0]
if ext != "":
return LinkChecker.get_link_extension(new_link) + ext
else:
return ""
def NewFileName(fichier):
cpt = 0
fic , *ext = fichier.split('.')
ext = '.'.join(ext)
while os.path.isfile(fichier):
cpt += 1
fichier = '{0}-({1}).{2}'.format(fic, cpt, ext)
return fichier