Creating a list of files and separate the extension - python

So, in the folder INFACT, I have the following files(and many more with the same extensions):
BFDRYCKSE.ad
BFDRYCKSE.txt
BFFALIV2SE.ad
BFFALIV2SE.txt
I need to zip those files according to the filename, but only those mentioned above. My final result should be:
BFDRYCKSE.zip, contains(BFDRYCKSE.ad, BFDRYCKSE.txt)
BFFALIV2SE.zip, contains(BFFALIV2SE.ad BFFALIV2SE.txt)
Here's my code:
import os
import glob
import zipfile
setfile = r'C:\Users\vijo9001\Desktop\Infact'
myset = [
"BFBRODSE",
"BFDRYCKSE",
"BFFALIV2SE",
"BFFALIVSSE",
"BFFRYSTSE",
"BFHUSHA1SE",
"BFHUSHA2SE",
"BFHUSHALSE",
"BFKONFEKSE",
"BFKROPPVSE",
"BFKROPP2SE",
"BFLIV2SE",
"BFLIVSSE",
"BFMAKEUPSE",
"BFMEJERISE",
"BFTOBAKSE"
]
os.chdir(setfile)
list_of_lists = []
for i, pattern in enumerate(myset):
list_of_files = glob.glob(r'*{pattern}*'.format(pattern=myset[i]))
list_of_lists.append(list_of_files)
n = 0
file = os.path.splitext(list_of_files[0])[0]
with zipfile.ZipFile(file + '.zip', 'w') as myzip:
for f in list_of_files:
myzip.write(f, compress_type=zipfile.ZIP_DEFLATED)
I keep getting
Traceback (most recent call last):
File "C:/Users/vijo9001/Desktop/Retailers Check/aaa.py", line 29, in <module>
file = os.path.splitext(list_of_files[0])[0]
IndexError: list index out of range
Why is that?

I don't think you need to use an enumerator to access your list elements.
Your problem statement says that you want each zipfile to have the same basename as the .ad and .txt files which it should contain. With that in mind I'd try something along these lines instead
for basename in myset:
filelist = glob.glob(r'*{pattern}*'.format(basename))
with zipfile.ZipFile(basename + '.zip', 'w') as myzip:
for f in filelist:
myzip.write(f, compress_type=zipfile.ZIP_DEFLATED)

My solution is without defining set names:
=^..^=
import os
import zipfile
# get all files from directory
files_list = os.listdir(".")
# collect valid files
files_to_pack = []
for item in files_list:
try:
file_name, file_extension = item.split('.')
if file_extension == 'ad' or file_extension == 'txt':
files_to_pack.append(item)
except:
pass
# sort and pair files
sorted_files = sorted(files_to_pack)
pairs_files = [sorted_files[i:2+i] for i in range(0,len(sorted_files),2)]
# zip paired files
for item in pairs_files:
with zipfile.ZipFile(item[0].split('.')[0] + '.zip', 'w') as myzip:
myzip.write(item[0])
myzip.write(item[1])
myzip.close()

Related

Python to search (text) files recursively and perform user-id deletion

OS: Ubuntu-18.04lts
Python version - 3.6.9
Excel report data.xlsx i have is,
I have many text files under /home/user/excel/report/directory and inside its sub-directories. Some of the text files along with path given below for reference.
/home/user/excel/report/file01.txt
/home/user/excel/report/folder-1/file02.txt
/home/user/excel/report/folder-1/filepath/file03.txt
/home/user/excel/report/folder-2/file04.txt
The filename of the text files are in excel sheet's B column. For each row, i need to a search the text file as per B column and need to look the User-ID in D column, if user-id exists in that particular row matched text file then user-id need to be removed from that text file, Same need to perform recursively.
Currently i below python code I'm using.
import os
import pandas as pd
data = pd.read_excel("data.xlsx")
d = dict(zip(data["File Name"], data["User-ID"]))
for file in d:
with open(f"/home/user/excel/report/" + file + ".txt", "r") as f:
contents = f.read().strip()
with open(f"/home/user/excel/report/" + file + ".txt", "w") as f:
f.write(contents.replace(d[file], ""))
Error:
$ python3.6 script.py
Traceback (most recent call last):
File "script.py", line 8, in <module>
with open(f"/home/user/excel/report/" + file + ".txt", "r") as f:
FileNotFoundError: [Errno 2] No such file or directory: '/home/user/excel/report/file03.txt'
Still script is look at /home/user/excel/report/directory only. Whereas file03.txt is present inside /home/user/excel/report/folder-1/filepath/ directory, Need help to fix this. Thanks.
You can use bash cmd find in python to find all the txt's paths.
import os
import pandas as pd
import subprocess
pycmd = lambda cmd: print(subprocess.check_output(cmd, shell=True, universal_newlines=True))
pycmd_output = lambda cmd: subprocess.check_output(cmd, shell=True, universal_newlines=True).strip().split('\n')
# use bash com find to find all txt file's path
os.chdir('/home/user/excel')
cmd = '''
find /home/user/excel/report/ -type f -name *.txt
'''
file_list = pycmd_output(cmd)
df_file_list = pd.Series(file_list)
file_list_name = df_file_list.str.split('/|\.').str[-2]
file_map = dict(zip(file_list_name ,df_file_list ))
# {'file02': '/home/user/excel/report/folder-1/file02.txt',
# 'file03': '/home/user/excel/report/folder-1/filepath/file03.txt',
# 'file04': '/home/user/excel/report/folder-2/file04.txt',
# 'file01': '/home/user/excel/report/file01.txt'}
data = pd.read_excel("data.xlsx")
data['file_path'] = data["File Name"].map(file_map)
# have duplicated values in `data["File Name"]`
d = data.groupby('file_path')['User-ID'].agg(list).to_dict()
for file, user_id_list in d.items():
with open(file, "r") as f:
contents = f.read().strip()
for user_id in user_id_list:
contents = contents.replace(user_id, "")
with open(file, "w") as f:
f.write(contents)
Considering
/home/user/excel/report/file01.txt
/home/user/excel/report/folder-1/file02.txt
/home/user/excel/report/folder-1/filepath/file03.txt
/home/user/excel/report/folder-2/file04.txt
you need first do discover where each file is located, if filenames are always unique this is relatively simple with os.walk. I would do:
import os
filepaths = {}
for dirpath, dirnames, filenames in os.walk("/home/user/excel/report"):
for fname in filenames:
filepaths[fname] = os.path.join(dirpath, fname)
print(filepaths)
which should created dict with keys being filenames and values paths to them. Then when you need to interact with file named "X" just use filepaths["X"].
If following code
import os
import pandas as pd
data = pd.read_excel("data.xlsx")
d = dict(zip(data["File Name"], data["User-ID"]))
for file in d:
with open(f"/home/user/excel/report/" + file + ".txt", "r") as f:
contents = f.read().strip()
with open(f"/home/user/excel/report/" + file + ".txt", "w") as f:
f.write(contents.replace(d[file], ""))
would work as intended if all files were inside /home/user/excel/report then following should work with files which might be in subdirs
import os
import pandas as pd
filepaths = {}
for dirpath, dirnames, filenames in os.walk("/home/user/excel/report"):
for fname in filenames:
filepaths[fname] = os.path.join(dirpath, fname)
data = pd.read_excel("data.xlsx")
d = dict(zip(data["File Name"], data["User-ID"]))
for file in d:
with open(filepaths[file+".txt"], "r") as f:
contents = f.read().strip()
with open(filepaths[file+".txt"], "w") as f:
f.write(contents.replace(d[file], ""))

Is there a way to load data from all files in a directory using Python?

My question: Is there a way to load data from all files in a directory using Python
Input: Get all files in a given directory of mine (wow.txt, testting.txt,etc.)
Process: I want to run all the files through a def function
Output: I want the output to be all the files names and their respective content below it.For example:
/home/file/wow.txt
"all of its content"
/home/file/www.txt
"all of its content"
Here is my code:
# Import Functions
import os
import sys
# Define the file path
path="/home/my_files"
file_name="wow.txt"
#Load Data Function
def load_data(path,file_name):
"""
Input : path and file_name
Purpose: loading text file
Output : list of paragraphs/documents and
title(initial 100 words considered as title of document)
"""
documents_list = []
titles=[]
with open( os.path.join(path, file_name) ,"rt", encoding='latin-1') as fin:
for line in fin.readlines():
text = line.strip()
documents_list.append(text)
print("Total Number of Documents:",len(documents_list))
titles.append( text[0:min(len(text),100)] )
return documents_list,titles
#Output
load_data(path,file_name)
Here is my output:
My Problem is that my output only takes one file and shows its content. Obviously, i defined the path and file name in my code to one file but I am confused as to how to write the path in a way to load all the files and output each of its contents separately. Any suggestions?
Using glob:
import glob
files = glob.glob("*.txt") # get all the .txt files
for file in files: # iterate over the list of files
with open(file, "r") as fin: # open the file
# rest of the code
Using os.listdir():
import os
arr = os.listdir()
files = [x for x in arr if x.endswith('.txt')]
for file in files: # iterate over the list of files
with open(file, "r") as fin: # open the file
# rest of the code
Try this:
import glob
for file in glob.glob("test/*.xyz"):
print(file)
if my directory name was "test" and I had lots of xyz files in them...
You can use glob and pandas
import pandas as pd
import glob
path = r'some_directory' # use your path
all_files = glob.glob(path + "/*.txt")
li = []
for filename in all_files:
#read file here
# if you decide to use pandas you might need to use the 'sep' paramaeter as well
df = pd.read_csv(filename, index_col=None, header=0)
li.append(df)
# get it all together
frame = pd.concat(li, axis=0, ignore_index=True)
I will take advantage of the function you have already written, so use the following:
data = []
path="/home/my_files"
dirs = os.listdir( path )
for file in dirs:
data.append(load_data(path, file))
In this case you will have all data in the list data.
Hi you can use a for loop on a listdir:
os.listdir(<path of your directory>)
this gives you the list of files in your directory, but this gives you also the name of folders in that directory
Try generating a file list first, then passing that to a modified version of your function.
def dir_recursive(dirName):
import os
import re
fileList = list()
for (dir, _, files) in os.walk(dirName):
for f in files:
path = os.path.join(dir, f)
if os.path.exists(path):
fileList.append(path)
fList = list()
prog = re.compile('.txt$')
for k in range(len(fileList)):
binMatch = prog.search(fileList[k])
if binMatch:
fList.append(binMatch.string)
return fList
def load_data2(file_list):
documents_list = []
titles=[]
for file_path in file_list:
with open( file_path ,"rt", encoding='latin-1') as fin:
for line in fin.readlines():
text = line.strip()
documents_list.append(text)
print("Total Number of Documents:",len(documents_list))
titles.append( text[0:min(len(text),100)] )
return documents_list,titles
# Generate a file list & load the data from it
file_list = dir_recursive(path)
documents_list, titles = load_data2(file_list)

python How do I import multiple .txt files in a folder to add characters to each .txt file?

There are text files of various names in the folder 'a'. I want to read all of these text files and add the letter 'b' to each text file. What should I do?
cwd = os.getcwd()
input_dir = os.path.join(cwd, "my .txt files dir")
sorts = sorted(glob(input_dir), key = lambda x:(len(x) , x))
for f in sorts :
f = open(input_dir, 'a')
data = "add text"
f.write(data)
f.close()
Append data to file:
- first: get all file in folder a.
- second: find extension with .txt.
- third: open it and do something('append', or 'rewrite').
Demo:
import os
# your .txt files dir
path = 'a'
# append data what you want
appendData = 'b'
fileNames = list(os.walk(path))[0][2]
fileNames.sort(key=len)
fileNums = len(fileNames)
# your dst file extension
fileExt = '.txt'
# # Extract extension from filename
# fileExt = os.path.splitext(fileNames[0])[1]
for fileName in fileNames:
if fileName.endswith(fileExt):
fileFullPath = os.path.join(path, fileName)
with open(fileFullPath, 'a') as f:
f.write(appendData)
Like the others said, this is an easy question that could easily be find on google. Anyway here's how to do it:
from os import listdir
from os.path import isfile, isdir, join
files = [file for file in listdir("files") if isfile(join("files", file))]
directories = [directory for directory in listdir("files") if isdir(join("files", directory))]
print(files)
for file_name in files:
try:
file = open("files/" + file_name, "a")
file.write("b")
file.close()
except IOError as err:
print("Could not open file because : ", err)
Replace "file" with the directory where your files are or the path to that directory like "directory0/directory1/directory_with_files"
Avoid to open files with
f = open(input_dir, 'a')
f.close()
Instead
with open(input_dir, 'a') as inputFile:
Do something
Also what you want is
import os
import glob # We will use this module to open only .txt files
path = 'your/path'
for filename in glob.glob(os.path.join(path, '*.txt'))
with open(filename, 'a') as inputFile:
inputFile.write('b')

I have a ".txt "file which consists of various filenames and I want to search each filename in a folder where these files are actually kept

Suppose I have a text file aiq_hits.txt.
Each line in this file corresponds a filename
ant1.aiq
ant2.aiq
ant3.aiq
ant4.aiq
I want to match each line of my textfile (ant1.aiq,ant2.aiq and so on) with filenames which are present at some specific place(R:\Sample) and extract matching files into some other place (R:\sample\wsa).
I have an idea that I need to use functions like os.walk() and fnmatch.fnmatch(), shutil.copy() but I am not able to implement them
My code:
import os
import shutil
import fnmatch
with open("aiq_hits.txt","r") as in_file:
for line in in_file:
I am stuck here
import os
import shutil
sourceDir = "R:\\Sample"
targetDir = "R:\\Sample\\wsa"
existingFiles = set(f for f in os.listdir(sourceDir) if os.path.isfile(os.path.join(sourceDir, f)))
infilepath = "aiq_hits.txt"
with open(infilepath) as infile:
for line in infile:
fname = line.strip()
if fname not in existingFiles: continue
shutil.move(os.path.join(sourceDir, fname), os.path.join(targetDir, fname))
I hope this will suffice:
import os
def match_files(url,file_read, dest):
f = open(file_read, 'rb')
file_list = os.listdir(url)
print(file_list)
saved_path = os.getcwd()
print("Current working directory is " + saved_path)
os.chdir(url)
match = []
for file_name in f:
file_name = file_name.strip()
if file_name in file_list:
match.append(file_name)
os.rename(os.path.join(url, file_name), os.path.join(dest, file_name))
os.chdir(saved_path)
print match
here, url is source directory or folder from which u want to match files, file_read is the name of file (with path) in which list of file names is given, dest is the destination folder.
this code moves the matching files from url to dest, i.e. these files won't remin in url after running the code.
Alternatively you could use the glob module which allows you to enter in a expression for the file name\extension which will then return a list that you can loop over.
I'd use this module if the source directory can have files with the same extension that you want to exclude from being looped over
Also I'm assuming that the file name list is not large and so storing it in a list wont be an issue
eg (I haven't tested the below )
from glob import glob
import os
import shutil
src = 'R:\\Sample'
dst = "R:\\Sample\\wsa"
in_file_list = "aiq_hits.txt"
list_Of_files = glob(os.path.join(src, 'ant*.aiq'))
data = []
with open(in_file_list) as reader:
data += reader.readlines()
for row in list_Of_files:
file_path, file_name = os.path.split(row)
if file_name in data:
shutil.copy2(row, os.path.join(dst, file_name))
# or if you want to move the file
# shutil.move(row, os.path.join(dst, file_name))

Get rows from all .txt files in directory using python

I have some txt files in a directory and I need to get the last 15 lines from all of them. How could I do it using python?
I chose this code:
from os import listdir
from os.path import isfile, join
dir_path= './'
files = [ f for f in listdir(dir_path) if isfile(join(dir_path,f)) ]
out = []
for file in files:
filedata = open(join(dir_path, file), "r").readlines()[-15:]
out.append(filedata)
f = open(r'./fin.txt','w')
f.writelines(out)
f.close()
but I get the error "TypeError: writelines() argument must be a sequence of strings". I think it's because of Russian letters in the lines.
import os
from collections import deque
for filename in os.listdir('/some/path'):
# might want to put a check it's actually a file here...
# (join it to a root path, or anything else....)
# and sanity check it's text of a usable kind
with open(filename) as fin:
last_15 = deque(fin, 15)
deque will automatically discard the oldest entry and peak the max size to be 15, so it's an efficient way of keeping just the "last" 'n' items.
Try this:
from os import listdir
from os.path import isfile
for filepath in listdir("/path/to/folder")
if isfile(filepath): # if need
last_five_lines = open(filepath).readlines()[-15:]
# or, one line:
x = [open(f).readlines()[-15:] for f in listdir("/path/to/folder") if isfile(f)]
Updated:
lastlines = []
for file in files:
lastlines += open(join(dir_path, file), "r").readlines()[-15:]
with open('./fin.txt', 'w') as f:
f.writelines(lastlines)
from os import listdir
from os.path import isfile, join
dir_path= '/usr/lib/something'
files = [ f for f in listdir(dir_path) if isfile(join(dir_path,f)) ]
for file in files:
filedata = open(join(dir_path, file), "r").readlines()[-15:]
#do something with the filedata
Hope this helps:
import os
current_dir = os.getcwd()
dir_objects = os.listdir(current_dir)
dict_of_last_15 = {}
for file in dir_objects:
file_obj = open(file, 'rb')
content = file_obj.readlines()
last_15_lines = content[-15:]
dict_of_last_15[file] = last_15_lines
print "#############: %s" % file
print dict_of_last_15[file]
file_to_check.close()

Categories

Resources