Read all files from folder and edit - python

I am trying to read all FASTA files from the test folder and append the file name to every header line of each file. The code works for the first file, but it does not proceed to the second file and returns an error. Could you help me find the bug in my code or fix it? Thanks
import sys, glob, os, string
# Asker's script (indentation was lost in extraction): for every entry of
# ./test/, write a copy named "<entry>_output.txt" in which each line starting
# with '>' gets "_<entry>" appended; all other lines are copied with trailing
# whitespace stripped.
header = ''
check = 0
path = "./test/"
dirs = os.listdir(path)
for file in dirs:
# NOTE(review): `file` is the bare entry name returned by os.listdir(path), so
# this open() is resolved against the current working directory, not `path`.
fp = open(file, "r")
# The output name is also relative to the current working directory.
fpx = open('%s_output.txt' % file, 'w')
for line in fp:
if line.startswith('>'):
line = line.rstrip()
check = check + 1
# `check` has just been incremented, so it is at least 1 whenever this test runs.
if check >= 1:
header = line
fpx.write(header + '_' + file + '\n')
else:
line = line.rstrip()
fpx.write(line + '\n')
# NOTE(review): neither fp nor fpx is ever closed.

It would be good to provide the error message you are getting! I think this must fail with "File not found" because you try to open the file by name instead of path. Try fp = open(os.path.join(path, file), "r"):
import sys, glob, os, string
# Append each FASTA file's name to all of its header lines.
#
# Every regular file in `path` is copied to "<name>_output.txt" in the current
# working directory: lines starting with '>' get "_<name>" appended, every
# other line is copied with its trailing whitespace stripped.
path = "./test/"
for file in os.listdir(path):
    # os.listdir() returns bare names, so join them with the folder before
    # opening; it also returns sub-directories, which cannot be read as text.
    source = os.path.join(path, file)
    if not os.path.isfile(source):
        continue
    # `with` closes both handles even if reading or writing fails half-way.
    with open(source, "r") as fp, open('%s_output.txt' % file, 'w') as fpx:
        for line in fp:
            line = line.rstrip()
            if line.startswith('>'):
                fpx.write(line + '_' + file + '\n')
            else:
                fpx.write(line + '\n')

Related

python - open and write all files in dir

I have the following script that does multiple operations for .txt file (splits by tag, removes line breaks, sentence per line & blank lines in between).
I know how to do this for a specific file:
import re
# Split input.txt into output_1.txt, output_2.txt, ...: a line equal to
# "</div>" ends the current output file, blank lines are dropped, and every
# other line is written one sentence per line with a blank line in between.
i = 1
w = None
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
sentence_break = re.compile(r"(?<=[.!:;?])\s+")
with open("input.txt", "r") as f:
    try:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if w is not None and line == "</div>":
                w.close()
                i += 1
                w = None
            else:
                if w is None:
                    w = open('output_%i.txt' % i, 'w')
                for s in sentence_break.split(line):
                    w.write(s + '\n\n')
    finally:
        # The final section is not necessarily terminated by "</div>", so make
        # sure the last output file is flushed and closed.
        if w is not None:
            w.close()
How can I apply this to all .txt files in a dir?
I cannot figure out how to open and write to all files in a dir.
I thought this would work:
import os
import re
# Asker's attempt to run the splitter over every entry of `path`
# (indentation was lost in extraction).
path = "/Users/simon/dic/en-new-oxford/output"
for filename in os.listdir(path):
with open(os.path.join(path, filename), 'r') as f:
i = 1
w = None
for line in f:
line = line.strip()
if line:
# A line equal to "</d:entry>" closes the current output file and bumps the counter.
if w is not None and line == "</d:entry>":
w.close()
i += 1
w = None
else:
if w is None:
# NOTE(review): os.path.join receives ONE string literal,
# 'path, filename_%i.txt'; the variables `path` and `filename` are not used,
# so the file opened is literally "path, filename_<i>.txt" relative to the
# current working directory. Presumably i restarts at 1 for each input file
# (indentation is lost), in which case later inputs overwrite earlier outputs.
w = open(os.path.join('path, filename_%i.txt') % i, 'w')
# NOTE(review): "\s" in a non-raw string literal is an invalid escape sequence.
for s in re.split("(?<=[.!:;?])\s+", line):
w.write(s + '\n\n')
# NOTE(review): `w` is only closed when a "</d:entry>" line is seen.
What is wrong about this?
This does work. You probably had a sub-directory inside your directory that was messing up your code. The version below only picks up regular files and also checks that each file name ends with '.txt'.
from os import listdir
from os.path import isfile, join
# Directory to scan. An empty string makes listdir() raise FileNotFoundError,
# so default to the current directory; replace with your own folder.
path = '.'
# Full paths of the regular '.txt' files in `path`; sub-directories and other
# file types are skipped.
allFiles = [
    join(path, name)
    for name in listdir(path)
    if name.endswith('.txt') and isfile(join(path, name))
]
for file in allFiles:
    with open(file) as f:
        pass  # do stuff with the open file `f` here

Getting UnicodeDecodeError

I am getting this weird UnicodeDecodeError and
I don't know what causes it. It would be really nice if someone could help me out with this issue :)
Error message:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 6456: character maps to <undefined>
Full Error message as an screenshot
screenshot of the Error message
My code:
import os
import json
import random
import csv
from pydub import AudioSegment
file_path = '/path/to/file/.tsv '
save_json_path = '/path/where/you/want/the/jsons/saved'
# Build train.json / test.json under save_json_path from the TSV at file_path.
# Each TSV row becomes a {"key": <clip path>, "text": <sentence>} record; when
# args.convert is truthy every clip is also converted from mp3 to wav with
# pydub's AudioSegment. Records are shuffled and written one JSON object per line.
# (Indentation of this snippet was lost in extraction.)
def main(args):
data = []
# Everything before the last '/' of file_path; clips are looked up in <directory>/clips/.
directory = file_path.rpartition('/')[0]
# With percent = 100, int(d - d/percent) below keeps about 1/100 of the records for test.json.
percent = int(100)
# First pass: count the lines of the TSV for the progress messages.
# NOTE(review): no encoding= is given, so the platform default codec is used;
# the count also includes the header row that DictReader consumes below.
with open(file_path) as f:
lenght = sum(1 for ine in f)
with open(file_path, newline='') as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t')
index = 1
if(args.convert):
print(str(lenght) + "files found")
for row in reader:
file_name = row['path']
# Same name with everything after the last '.' replaced by "wav".
filename = file_name.rpartition('.')[0] + ".wav"
text = row['sentence']
if(args.convert):
data.append({
"key": directory + "/clips/" + filename,
"text": text
})
# end="\r" rewrites the same console line on every iteration.
print("converting file " + str(index) + "/" + str(lenght) + " to wav", end="\r")
src = directory + "/clips/" + file_name
dst = directory + "/clips/" + filename
sound = AudioSegment.from_mp3(src)
sound.export(dst, format="wav")
index = index + 1
else:
data.append({
"key": directory + "/clips/" + file_name,
"text": text
})
random.shuffle(data)
print("creating JSON's")
# NOTE(review): this handle is rebound by the `with` on the next line and is
# never explicitly closed.
f = open(save_json_path +"/"+ "train.json", "w")
with open(save_json_path +"/"+ 'train.json','w') as f:
d = len(data)
i=0
# Records [0, int(d - d/percent)) go to train.json.
while(i<int(d-d/percent)):
r=data[i]
line = json.dumps(r)
f.write(line + "\n")
i = i+1
# NOTE(review): same pattern as above; the first handle is never explicitly closed.
f = open(save_json_path +"/"+ "test.json", "w")
with open(save_json_path +"/"+ 'test.json','w') as f:
d = len(data)
# The remaining records [int(d - d/percent), d) go to test.json.
i=int(d-d/percent)
while(i<d):
r=data[i]
line = json.dumps(r)
f.write(line + "\n")
i = i+1
print("Done!")
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="""
Utility script to convert commonvoice into wav and create the training and test json files for speechrecognition. """
    )
    # NOTE: with default=True and action='store_true' the option is always
    # True, whether or not --convert is given on the command line.
    parser.add_argument('--convert', default=True, action='store_true',
                        help='says that the script should convert mp3 to wav')
    # parse_known_args() returns a (namespace, leftover_args) tuple; main()
    # reads args.convert, so it must receive the namespace only.
    args, _unknown = parser.parse_known_args()
    main(args)
It looks like you're getting this error in this block
with open(file_path) as f:
length = sum(1 for line in f)
In another post, though it doesn't have an accepted answer, this is shown to likely be because of the encoding of your file.
Try adding the encoding kwarg to open
# latin-1 maps every byte to a character, so decoding can no longer fail.
# If the file is actually stored in another encoding (e.g. UTF-8), non-ASCII
# text will be decoded to the wrong characters; pass the real encoding then.
with open(file_path, encoding="latin-1") as f:
    length = sum(1 for line in f)

IOError: [Errno 2] No such file or directory: (Python: I am having this error even though the file does exist)

As the error output below shows, the script cannot find the .json file, even though I have it in the streamed tweets folder. The data exists; I'm not sure what is wrong with my path.
import json
import os, os.path
# counter keeps track of the unique tweet ids and is used later when
# prepending the ElasticSearch compatibility
counter = 1
# find all files in folderPath, does not include subdirectories or directories
# NOTE(review): the string literal below is split across two physical lines
# (possibly line wrapping introduced when the post was extracted).
folderPath = r"c:/Users/Katherine/Downloads/Final Project-.zip/Final
Project/122_finalproject_part2/streamed_tweets";
# next(os.walk(...)) is the (dirpath, dirnames, filenames) triple for
# folderPath itself; index 2 selects the file names.
files = next(os.walk(folderPath))[2]
# filecount keeps track of how many files are in the streamedtweets folder
filecount = len(files) - 1
print(filecount)
ctr = 0
for ctr in range(filecount):
# NOTE(review): both file names are built from `filecount`, not from the loop
# variable `ctr`, so every iteration targets the same pair of files. The
# relative folder 'streamedtweets/' is also spelled differently from the
# 'streamed_tweets' folder named in folderPath.
inFileName = open('streamedtweets/tweet_data_' + str(filecount) + '.json', 'r')
outFileName = open('elastic_data/elastictwitter_data_' + str(filecount) + '.json', 'w')
ERROR
File "add_elastic.py", line 16, in <module>
inFileName = open('streamedtweets/tweet_data_' + str(filecount) + '.json', 'r')
IOError: [Errno 2] No such file or directory: 'streamedtweets/tweet_data_32.json'
You expected tweet_data_ to be appended with 0, 1, 2, 3... but in actuality you're always appending 32 which doesn't seem to exist. A better way would be to loop directly through the files you have instead of guessing its name.
Try doing this instead:
folderPath = r"c:/Users/Katherine/Downloads/Final Project-.zip/Final Project/122_finalproject_part2/streamed_tweets"
# next(os.walk(...)) is the (dirpath, dirnames, filenames) triple for
# folderPath itself; the default keeps a missing folder from raising
# StopIteration and simply yields no files.
files = next(os.walk(folderPath), (folderPath, [], []))[2]
for i, file in enumerate(files):
    if file.startswith('tweet_data_') and file.endswith('.json'):
        # os.walk returns bare file names: join them with the folder, otherwise
        # open() looks in the current working directory.
        inFileName = open(os.path.join(folderPath, file), 'r')
        outFileName = open('elastic_data/elastictwitter_data_' + str(i) + '.json', 'w')
        # ... process the data here, then close both handles.
An even better way would be to use the with statement to manage opening and closing your files:
for i, file in enumerate(files):
    if file.startswith('tweet_data_') and file.endswith('.json'):
        # `files` holds bare names, so the input must be joined with its folder.
        in_path = os.path.join(folderPath, file)
        out_path = 'elastic_data/elastictwitter_data_' + str(i) + '.json'
        # Both files are closed automatically when the block exits.
        with open(in_path, 'r') as inFileName, open(out_path, 'w') as outFileName:
            # do something with inFileName / outFileName
            pass

Find files in a directory containing desired string in Python

I'm trying to find a string in files contained within a directory. I have a string like banana that I know that exists in a few of the files.
import os
import sys
# Asker's script (indentation was lost in extraction): lists the entries of a
# user-supplied directory and reports, per entry, whether the search word was found.
user_input = input("What is the name of you directory?")
directory = os.listdir(user_input)
searchString = input("What word are you trying to find?")
for fname in directory: # change directory as needed
# NOTE(review): this tests whether the word occurs in the file NAME; the file
# contents are never read.
if searchString in fname:
# NOTE(review): `fname` is a bare name, so open() resolves it against the
# current working directory; the handle is neither read nor closed.
f = open(fname,'r')
# NOTE(review): the % operator is applied to the result of print(...), not to
# the format string.
print('found string in file %s') %fname
else:
print('string not found')
When the program runs, it just outputs string not found for every file. There are three files that contain the word banana, so the program isn't working as it should. Why isn't it finding the string in the files?
You are searching for the string in the file name rather than in the file contents. Read the contents with open(filename, 'r').read() and search in that instead:
import os
# Report, for every regular file directly inside a user-supplied directory,
# whether its contents contain the search word.
user_input = input('What is the name of your directory')
directory = os.listdir(user_input)
searchstring = input('What word are you trying to find?')
for fname in directory:
    # Full path
    full_path = user_input + os.sep + fname
    # os.listdir() also returns sub-directories; only regular files are read.
    if not os.path.isfile(full_path):
        continue
    # `with` closes the file even if reading fails; errors='replace' keeps a
    # binary or differently-encoded file from aborting the whole scan.
    with open(full_path, 'r', errors='replace') as f:
        found = searchstring in f.read()
    if found:
        print('found string in file %s' % fname)
    else:
        print('string not found')
We use user_input + os.sep + fname to get full path.
os.listdir gives files and directories names, so we use os.path.isfile to check for files.
Here is another version using the Path module from pathlib instead of os.
def search_in_file(path, searchstring):
    """Print whether *searchstring* occurs in the file at *path*.

    Args:
        path: A ``pathlib.Path`` to the file; ``path.name`` is used in the
            message.
        searchstring: Text to look for in the file's contents.

    Prints ``' found string in file <name>'`` on a hit and
    ``'string not found'`` otherwise. Returns ``None``.
    """
    # errors='replace' keeps binary or differently-encoded files from raising
    # UnicodeDecodeError and stopping the scan of the remaining files.
    with open(path, 'r', errors='replace') as file:
        found = searchstring in file.read()
    if found:
        print(f' found string in file {path.name}')
    else:
        print('string not found')
from pathlib import Path

user_input = input('What is the name of your directory')
searchstring = input('What word are you trying to find?')
# sorted() makes the order of the report deterministic.
dir_content = sorted(Path(user_input).iterdir())
for path in dir_content:
    # is_file() skips directories as well as entries that cannot be read as
    # regular files (e.g. broken symlinks).
    if path.is_file():
        search_in_file(path, searchstring)
This is my solution for the problem. It comes with the feature of also checking in sub-directories, as well as being able to handle multiple file types. It is also quite easy to add support for other ones. The downside is of course that it's quite chunky code. But let me know what you think.
import os
import docx2txt
from pptx import Presentation
import pdfplumber
def findFiles(strings, dir, subDirs, fileContent, fileExtensions):
    """Return the files in *dir* that match at least one of *strings*.

    Args:
        strings: Iterable of search terms. Matching is case-insensitive.
        dir: Directory to search.
        subDirs: If true, also look in sub-directories of *dir*.
        fileContent: If true, also search the text returned by
            ``getFileContent`` for every file, in addition to its name.
            Ignored when *fileExtensions* is true.
        fileExtensions: If true, match the terms against the file extension
            (including the leading dot) instead of the name/content.

    Returns:
        A list of matching file paths, with "\\" replaced by "/"; each file
        appears at most once.
    """
    if subDirs:
        # os.walk already separates files from directories.
        candidates = [
            os.path.join(root, name).replace("\\", "/")
            for root, _dirnames, names in os.walk(dir)
            for name in names
        ]
    else:
        all_entries = (
            os.path.join(dir, name).replace("\\", "/")
            for name in os.listdir(dir)
        )
        candidates = [entry for entry in all_entries if os.path.isfile(entry)]

    # The searched text is lower-cased below, so the terms must be lower-cased
    # too, otherwise a term containing an upper-case letter can never match.
    needles = [string.lower() for string in strings]

    foundFiles = []
    for file in candidates:
        # Define what is to be searched in
        stem, extension = os.path.splitext(file)
        if fileExtensions:
            fileText = extension.lower()
        else:
            fileText = os.path.basename(stem).lower()
            if fileContent:
                fileText += getFileContent(file).lower()
        if any(needle in fileText for needle in needles):
            foundFiles.append(file)
    return foundFiles
# File extensions (without the dot) that getFileContent knows how to read.
supportedTypes = ["txt", "docx", "pdf", "pptx"]


def getFileContent(filename):
    """Return the text content of *filename*.

    Args:
        filename: Path of the file to read.

    Returns:
        The extracted text as one string, or "" when the file's extension is
        not listed in ``supportedTypes``.
    """
    # splitext looks only at the final extension, so dots in directory names
    # or in the file stem do not hide a supported type; lower() accepts
    # upper-case extensions such as ".TXT".
    extension = os.path.splitext(filename)[1].lower().lstrip(".")
    if extension not in supportedTypes:
        return ""
    if extension == "pdf":
        with pdfplumber.open(filename) as pdf:
            # Guard against a page yielding no text (None) instead of a string.
            return "".join(page.extract_text() or "" for page in pdf.pages)
    if extension == "txt":
        with open(filename, 'r') as f:
            return f.read()
    if extension == "docx":
        return docx2txt.process(filename)
    if extension == "pptx":
        prs = Presentation(filename)
        # Only shapes that expose a `text` attribute contribute content.
        return "".join(
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    # Listed in supportedTypes but no reader implemented above.
    return ""
print(findFiles(strings=["buch"], dir="C:/Users/User/Desktop/", subDirs=True, fileContent=True, fileExtensions=False))
Here is the most simple answer I can give you. You don't need the colors, they are just cool and you may find that you can learn more than one thing in my code :)
import os
from time import sleep
# ANSI escape sequences used to colour the terminal output.
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


# Ask the user for the directory, the file-name suffix and the search string
search_path = input("Enter directory path to search : ")
file_type = input("File Type : ")
search_str = input("Enter the search string : ")

# If path does not exist, set search path to current directory
if not os.path.exists(search_path):
    search_path = "."

# Repeat for each file in the directory
for fname in os.listdir(path=search_path):
    # os.path.join supplies the separator, so the "." fallback above yields
    # "./name" rather than ".name".
    full_path = os.path.join(search_path, fname)
    # Apply file type filter and skip sub-directories
    if not fname.endswith(file_type) or not os.path.isfile(full_path):
        continue
    # Read the whole file; `with` closes it even if reading fails
    with open(full_path, 'r') as fo:
        content = fo.read()
    if search_str in content:
        print(bcolors.OKGREEN + '[+]' + bcolors.ENDC + ' ', fname, sep="")
        print(' ')
    else:
        print(bcolors.FAIL + '[-]' + bcolors.ENDC + ' ', fname, ' ', 'does not contain', ' ', search_str, sep="")
        print(" ")
    sleep(0.01)
That is it!
I was trying with the following code for this kind of problem, please have a look.
import os,sys
# Print every line, with its line number and column, that contains the search
# string, for each matching file directly inside a user-supplied directory.
search_path = input("Put the directory here:")
# The filter below needs a suffix to compare against; without this prompt
# `file_type` is undefined and the loop fails with NameError.
file_type = input("File Type : ")
search_str = input("Enter your string")

# If path does not exist, set search path to current directory
if not os.path.exists(search_path):
    search_path = "."

# Repeat for each file in the directory
for fname in os.listdir(path=search_path):
    # os.path.join supplies the separator, so the "." fallback above yields
    # "./name" rather than ".name".
    full_path = os.path.join(search_path, fname)
    # Apply file type filter and skip sub-directories
    if not fname.endswith(file_type) or not os.path.isfile(full_path):
        continue
    with open(full_path) as fo:
        # enumerate keeps the 1-based line counter in step with the lines read
        for line_no, line in enumerate(fo, start=1):
            index = line.find(search_str)
            if index != -1:
                print(fname, "[", line_no, ",", index, "] ", line, sep="")

Reading file from a directory in Python

I have a folder containing files on my desktop, and I'm trying to write a script that will read each of the files, replace the spaces with commas, and then write each file out as a CSV file.
Here is my code but it is not working:
import os
import re
import csv
# Asker's script (indentation was lost in extraction): for every entry of
# `path`, turn runs of whitespace into commas and write the result to
# "<name before the first dot>.csv" in the current working directory.
# NOTE(review): the Windows path below is a normal (non-raw) string literal,
# so its backslashes are subject to escape processing.
path = 'C:\Users\Kenny\Desktop\TTUM'
listing = os.listdir(path)
for infile in listing:
dir_item_path = os.path.join(path, infile)
fh = open(dir_item_path,'r')
for line in fh.readlines():
space_remove = re.sub(r"\s+",",",line.rstrip())
# NOTE(review): every whitespace run was just replaced by ",", so no " " is
# left to split on and this yields a one-element list.
split_Line = space_remove.split(" ")
Fname = infile
Lname = Fname.split('.')[0]
name = Lname + ".csv"
# NOTE(review): presumably inside the per-line loop (indentation is lost); if
# so, the output file is reopened, and therefore truncated, for every line.
process_file = open(name,"wb")
newfile = csv.writer(process_file)
newfile.writerow(split_Line)
process_file.close()
# NOTE(review): fh is never closed.
You are re-opening the file, writing a line, and closing every time. This will truncate the file and just write the single line. Try opening before the for loop (which you are already doing with the input file), and closing when everything is done.
# Convert every whitespace-separated text file in `path` into a CSV file named
# "<name before the first dot>.csv" in the current working directory.
# Raw string: the backslashes of a Windows path must not be read as escapes.
path = r'C:\Users\Kenny\Desktop\TTUM'
listing = os.listdir(path)
for infile in listing:
    dir_item_path = os.path.join(path, infile)
    # os.listdir() also returns sub-directories, which cannot be read as text.
    if not os.path.isfile(dir_item_path):
        continue
    name = infile.split('.')[0] + ".csv"
    # Open the output once per input file, not once per line, so earlier rows
    # are not truncated away. The csv module expects a text-mode file opened
    # with newline=''.
    with open(dir_item_path, 'r') as fh, open(name, "w", newline='') as process_file:
        newfile = csv.writer(process_file)
        for line in fh:
            # split() without arguments splits on any run of whitespace, giving
            # one CSV field per word; the writer inserts the commas.
            newfile.writerow(line.split())
Of course, there may be a lot more wrong with your script, but to help with that you need to explain exactly what the problem is.

Categories

Resources