I'm trying to find a string in files contained within a directory. I have a string like banana that I know exists in a few of the files.
import os
import sys

user_input = input("What is the name of you directory?")
directory = os.listdir(user_input)
searchString = input("What word are you trying to find?")

for fname in directory:  # change directory as needed
    if searchString in fname:
        f = open(fname, 'r')
        print('found string in file %s') % fname
    else:
        print('string not found')
When the program runs, it just outputs string not found for every file. There are three files that contain the word banana, so the program isn't working as it should. Why isn't it finding the string in the files?
You are searching for the string in the filename itself, not in the file's contents. Read the file with open(filename, 'r').read() and search in that:
import os

user_input = input('What is the name of your directory')
directory = os.listdir(user_input)
searchstring = input('What word are you trying to find?')

for fname in directory:
    if os.path.isfile(user_input + os.sep + fname):
        # Full path
        f = open(user_input + os.sep + fname, 'r')
        if searchstring in f.read():
            print('found string in file %s' % fname)
        else:
            print('string not found')
        f.close()
We use user_input + os.sep + fname to get the full path.
os.listdir returns both file and directory names, so we use os.path.isfile to keep only the files.
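A slightly tidier variant of the same idea, offered as a sketch: os.path.join builds the path portably, and a with block closes the file automatically.

import os

user_input = input('What is the name of your directory')
searchstring = input('What word are you trying to find?')

for fname in os.listdir(user_input):
    full_path = os.path.join(user_input, fname)  # portable alternative to user_input + os.sep + fname
    if os.path.isfile(full_path):
        with open(full_path, 'r') as f:  # the file is closed automatically when the block ends
            if searchstring in f.read():
                print('found string in file %s' % fname)
            else:
                print('string not found')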
Here is another version using the Path class from pathlib instead of os.
from pathlib import Path

def search_in_file(path, searchstring):
    with open(path, 'r') as file:
        if searchstring in file.read():
            print(f'found string in file {path.name}')
        else:
            print('string not found')

user_input = input('What is the name of your directory')
searchstring = input('What word are you trying to find?')

dir_content = sorted(Path(user_input).iterdir())
for path in dir_content:
    if not path.is_dir():
        search_in_file(path, searchstring)
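If sub-directories should be searched as well, a sketch of the same approach can use Path.rglob('*') instead of iterdir() (assuming every file in the tree should be checked):

from pathlib import Path

def search_in_file(path, searchstring):
    # read_text() returns the whole file as one string
    if searchstring in path.read_text():
        print(f'found string in file {path.name}')
    else:
        print('string not found')

user_input = input('What is the name of your directory')
searchstring = input('What word are you trying to find?')

# rglob('*') walks the directory tree recursively
for path in sorted(Path(user_input).rglob('*')):
    if path.is_file():
        search_in_file(path, searchstring)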
This is my solution to the problem. It also checks sub-directories and can handle multiple file types, and it is quite easy to add support for other ones (a small sketch of that follows the code). The downside is of course that it's quite chunky code, but let me know what you think.
import os
import docx2txt
from pptx import Presentation
import pdfplumber

def findFiles(strings, dir, subDirs, fileContent, fileExtensions):
    # Finds all the files in 'dir' that contain one string from 'strings'.
    # Additional parameters:
    # 'subDirs': True/False : Look in sub-directories of your folder
    # 'fileContent': True/False : Also look for the strings in the file content of every file
    # 'fileExtensions': True/False : Look for a specific file extension -> 'fileContent' is ignored
    filesInDir = []
    foundFiles = []
    filesFound = 0

    if not subDirs:
        for filename in os.listdir(dir):
            if os.path.isfile(os.path.join(dir, filename).replace("\\", "/")):
                filesInDir.append(os.path.join(dir, filename).replace("\\", "/"))
    else:
        for root, subdirs, files in os.walk(dir):
            for f in files:
                if not os.path.isdir(os.path.join(root, f).replace("\\", "/")):
                    filesInDir.append(os.path.join(root, f).replace("\\", "/"))
    print(filesInDir)

    # Find files that contain the keyword
    if filesInDir:
        for file in filesInDir:
            print("Current file: " + file)

            # Define what is to be searched in
            filename, extension = os.path.splitext(file)
            if fileExtensions:
                fileText = extension
            else:
                fileText = os.path.basename(filename).lower()
                if fileContent:
                    fileText += getFileContent(file).lower()

            # Check for translations
            for string in strings:
                print(string)
                if string in fileText:
                    foundFiles.append(file)
                    filesFound += 1
                    break

    return foundFiles

def getFileContent(filename):
    '''Returns the content of a file of a supported type (list: supportedTypes)'''
    if filename.partition(".")[2] in supportedTypes:
        if filename.endswith(".pdf"):
            content = ""
            with pdfplumber.open(filename) as pdf:
                for x in range(0, len(pdf.pages)):
                    page = pdf.pages[x]
                    content = content + page.extract_text()
            return content
        elif filename.endswith(".txt"):
            with open(filename, 'r') as f:
                content = ""
                lines = f.readlines()
                for x in lines:
                    content = content + x
            return content
        elif filename.endswith(".docx"):
            content = docx2txt.process(filename)
            return content
        elif filename.endswith(".pptx"):
            content = ""
            prs = Presentation(filename)
            for slide in prs.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        content = content + shape.text
            return content
    else:
        return ""

supportedTypes = ["txt", "docx", "pdf", "pptx"]

print(findFiles(strings=["buch"], dir="C:/Users/User/Desktop/", subDirs=True, fileContent=True, fileExtensions=False))
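For example, adding another plain-text type (say .csv or .md, extensions chosen here purely for illustration) could amount to a small reader helper, one more branch in getFileContent, and one more entry in supportedTypes. A hedged sketch:

def getPlainTextContent(filename):
    # Hypothetical helper: formats that are already plain text can simply be read as-is
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()

# getFileContent could then gain a branch next to the ".txt" case:
#     elif filename.endswith((".csv", ".md")):
#         return getPlainTextContent(filename)
# and supportedTypes would become ["txt", "docx", "pdf", "pptx", "csv", "md"]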
Here is the simplest answer I can give you. You don't need the colours; they are just cool, and you may find that you can learn more than one thing from my code :)
import os
from time import sleep

# The colours of the things
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

# Ask the user what to search and where
search_path = input("Enter directory path to search : ")
file_type = input("File Type : ")
search_str = input("Enter the search string : ")

# Append a directory separator if not already present
if not (search_path.endswith("/") or search_path.endswith("\\")):
    search_path = search_path + "/"

# If the path does not exist, fall back to the current directory
if not os.path.exists(search_path):
    search_path = "./"

# Repeat for each file in the directory
for fname in os.listdir(path=search_path):
    # Apply the file type filter
    if fname.endswith(file_type):
        # Read the whole file
        fo = open(search_path + fname, 'r')
        contents = fo.read()
        # Search for the string anywhere in the file
        if contents.find(search_str) != -1:
            print(bcolors.OKGREEN + '[+]' + bcolors.ENDC + ' ', fname, sep="")
            print(' ')
            sleep(0.01)
        else:
            print(bcolors.FAIL + '[-]' + bcolors.ENDC + ' ', fname, ' ', 'does not contain', ' ', search_str, sep="")
            print(" ")
            sleep(0.01)
        # Close the file
        fo.close()
That is it!
I was trying the following code for this kind of problem; please have a look.
import os, sys

search_path = input("Put the directory here:")
search_str = input("Enter your string")
file_type = input("File Type : ")

# Append a directory separator if not already present
if not (search_path.endswith("/") or search_path.endswith("\\")):
    search_path = search_path + "/"

# If path does not exist, set search path to current directory
if not os.path.exists(search_path):
    search_path = "./"

# Repeat for each file in the directory
for fname in os.listdir(path=search_path):
    # Apply file type filter
    if fname.endswith(file_type):
        # Open file for reading
        fo = open(search_path + fname)
        # Read the first line from the file
        line = fo.readline()
        # Initialize counter for line number
        line_no = 1
        # Loop until EOF
        while line != '':
            # Search for string in line
            index = line.find(search_str)
            if index != -1:
                print(fname, "[", line_no, ",", index, "] ", line, sep="")
            # Read next line
            line = fo.readline()
            # Increment line counter
            line_no += 1
        # Close the file
        fo.close()
I want to replace a string with another string in a file. I have the program below to perform the task.
import os
import sys
import traceback
from glob import iglob

def usage():
    print('Usage: python FindAndReplace.py [Old String] [New String] '
          '[File Filters(default:".txt,.xml")] [Directory To Check(.)]')

def search_replace_string(fileName, old_str, new_str):
    if not (os.path.isfile(fileName) and os.access(fileName, os.W_OK)):
        print("Warning: Skipping..File does not exist or and is not writeable:" + filename)
        return False

    fileupdated = False
    # Read the old file
    with open(fileName, 'r') as f:
        newlines = []
        for lines in f.readlines():
            if old_str in lines:
                fileupdated = True
            line = lines.replace(old_str, new_str)
            newlines.append(line)

    # Write changes to same file
    if fileupdated:
        print("string Found and Updating File: " + fileName)
        try:
            with open(fileName, 'w') as f:
                for line in newlines:
                    f.write(line)
        except:
            print("Error: Cannot open/access existing file for writing: " + fileName)

    return fileupdated

def main():
    try:
        DEFAULT_PATH = iglob(str('<path_to_file.xml'))
        if len(sys.argv) < 3:
            usage()
            # old/new string required parameters, exit if not supplied
            sys.exit(-1)
        else:
            oldString = sys.argv[1]
            newString = sys.argv[2]

        if len(sys.argv) < 4:
            patterns = ['.xml', '.txt']
        else:
            stringFilter = sys.argv[3]
            patterns = stringFilter.split(',')

        if len(sys.argv) < 5:
            path = DEFAULT_PATH
        else:
            path = sys.argv[4]

        print('[Old String] :' + oldString)
        print('[New String] :' + newString)
        print('[File Filters] :' + ', '.join(patterns))
        print('[Directory To Check] :' + path)

        if not os.path.exists(path):
            raise Exception("Selected path does not exist: " + path)

        # Walk through directory structure looking for files matching patterns
        matchingFileList = [os.path.join(dp, f)
                            for dp, dn, filenames in os.walk(path)
                            for f in filenames
                            if os.path.splitext(f)[1] in patterns]
        print('Files found matching patterns: ' + str(len(matchingFileList)))

        filecount = 0
        filesReplaced = 0
        for currFile in matchingFileList:
            filecount += 1
            filesReplaced = search_replace_string(currFile, old_str, new_str)
            if filesReplaced:
                filesReplaced += 1

        print("Total Files Searched :" + str(filecount))
        print("Total Files Replaced/Updated :" + str(filesReplaced))

    except Exception as err:
        print(traceback.format_exception_only(type(err), err)[0].rstrip())
        sys.exit(-1)

if __name__ == '__main__':
    main()
When I execute it from the command line, I get the error below:
(null): can't open file 'uro.py': [Errno 2] No such file or directory
Below is the command line I am using:
python uro.py <file_path> <old_str> <new_str>
NOTE: I am using an xml file.
I wanted to develop logic that takes the file name and the old and new strings as command line arguments. From the error, one can see that the program is treating the Python file as the input file, while it should take the file path that I pass as a command line argument.
What's the mistake here? Please suggest. Thank you
I got it fixed, there was some path issue. Thank you.
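For reference, a minimal sketch of how sys.argv is laid out: the script name is always argv[0], so the user-supplied values start at argv[1].

import sys

# Invoked as, for example: python FindAndReplace.py oldText newText ".txt,.xml" ./some_dir
print(sys.argv[0])   # the script name itself, e.g. 'FindAndReplace.py'
print(sys.argv[1:])  # everything after it: the user-supplied arguments

if len(sys.argv) >= 3:
    old_string, new_string = sys.argv[1], sys.argv[2]
    print('Replacing %r with %r' % (old_string, new_string))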
Below is the data in CFS_Config.txt. This text file records where the documents are stored so that paths are not hardcoded in the program. For instance, if the program is moved to another environment, we only need to change the directory paths in the CFS_Config.txt file.
Folder Path = ../dataprep/source_documents
ED Notes name = ED Notes
ED Notes output = ../dataprep/ED_Notes
The code below, in a Python file, reads the configuration from the CFS_Config.txt mentioned earlier and also generates a text file automatically.
The problem is that I am told the ../dataprep/ED_Notes path was not found. Please take a look at the code; if more code is needed I will try my best to provide it, thanks!!! :((
from preprocessing import ednotes_extractor
import os

def read_config():
    # open existing file to read configuration
    cfs_config_txt = open("..\CFS_Config.txt", "r")
    file_list = []
    root_dir = ""
    ednotes_name = ""
    ednotes_output = ""

    for line in cfs_config_txt:
        file_list.append(line)

    if "Folder Path = " in file_list[0]:
        root_dir = str(file_list[0])
        root_dir = root_dir.replace("Folder Path = ", "")
        root_dir = root_dir.replace("\n", "")

    if "ED Notes name = " in file_list[1]:
        ednotes_name = str(file_list[1])
        ednotes_name = ednotes_name.replace("ED Notes name = ", "")
        ednotes_name = ednotes_name.replace("\n", "")

    if "ED Notes output = " in file_list[2]:
        ednotes_output = str(file_list[2])
        ednotes_output = ednotes_output.replace("ED Notes output = ", "")
        ednotes_output = ednotes_output + ".txt"
        ednotes_output = ednotes_output.replace("\n", "")

    return root_dir, ednotes_name, ednotes_output

def convert_txt(choices):
    root_dir, ednotes_name, ednotes_output = read_config()

    if(choices == 1):
        # open new file to write string data textfile
        text_file = open(ednotes_output, 'w', encoding='utf-8')
        text_file.write("cat_id|content\n")

        for filename in os.listdir(root_dir):
            source_directory = root_dir + '/' + filename
            arr = ednotes_extractor.get_ednotes(source_directory)

            # open existing file to append the items in the array to the previously written textfile
            text_file = open(ednotes_output, 'a', encoding='utf-8')
            for item in arr:
                text_file.write("%s\n" % item)
    elif(choices==2):
        print("Hi")
I am trying to read all fasta files from a test folder and put the name of the file in all headers of each individual file. The code works for the first file but doesn't proceed to the second file and returns an error. Could you help me find the bug in my code or edit it? Thanks.
import sys, glob, os, string

header = ''
check = 0
path = "./test/"
dirs = os.listdir(path)

for file in dirs:
    fp = open(file, "r")
    fpx = open('%s_output.txt' % file, 'w')
    for line in fp:
        if line.startswith('>'):
            line = line.rstrip()
            check = check + 1
            if check >= 1:
                header = line
                fpx.write(header + '_' + file + '\n')
        else:
            line = line.rstrip()
            fpx.write(line + '\n')
It would be good to provide the error message you are getting! I think this must fail with "File not found" because you try to open the file by its name alone instead of its full path. Try fp = open(os.path.join(path, file), "r"):
import sys, glob, os, string

header = ''
check = 0
path = "./test/"
dirs = os.listdir(path)

for file in dirs:
    fp = open(os.path.join(path, file), "r")
    fpx = open('%s_output.txt' % file, 'w')
    for line in fp:
        if line.startswith('>'):
            line = line.rstrip()
            check = check + 1
            if check >= 1:
                header = line
                fpx.write(header + '_' + file + '\n')
        else:
            line = line.rstrip()
            fpx.write(line + '\n')
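For completeness, a sketch of the same loop using with blocks, so both files are closed even if an error occurs (output files are still written next to the script, as in the original):

import os

path = "./test/"

for file in os.listdir(path):
    in_path = os.path.join(path, file)
    if not os.path.isfile(in_path):
        continue  # skip sub-directories
    with open(in_path, "r") as fp, open('%s_output.txt' % file, 'w') as fpx:
        for line in fp:
            line = line.rstrip()
            if line.startswith('>'):
                fpx.write(line + '_' + file + '\n')  # append the file name to the header
            else:
                fpx.write(line + '\n')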
I'm working on a script to create an epub from html files, but when I check my epub I get the following error: Mimetype entry missing or not the first in archive
The mimetype file is present, but it's not the first file in the epub. Any idea how to put it in first place in every case, using Python?
Sorry, I don't have the time right now to give a detailed explanation, but here's a (relatively) simple epub processing program I wrote a while ago that shows how to do that.
epubpad.py
#! /usr/bin/env python

''' Pad the ends of paragraph lines in an epub file with a single space char

    Written by PM 2Ring 2013.05.12
'''

import sys, re, zipfile

def bold(s): return "\x1b[1m%s\x1b[0m" % s

def report(attr, val):
    print "%s '%s'" % (bold(attr + ':'), val)

def fixepub(oldname, newname):
    oldz = zipfile.ZipFile(oldname, 'r')
    nlist = oldz.namelist()
    #print '\n'.join(nlist) + '\n'

    if nlist[0] != 'mimetype':
        print bold('Warning!!!'), "First file is '%s', not 'mimetype'" % nlist[0]

    #get the name of the contents file from the container
    container = 'META-INF/container.xml'
    # container should be in nlist
    s = oldz.read(container)
    p = re.compile(r'full-path="(.*?)"')
    a = p.search(s)
    contents = a.group(1)
    #report("Contents file", contents)

    i = contents.find('/')
    if i >= 0:
        dirname = contents[:i+1]
    else:
        #No directory separator in contents name!
        dirname = ''
    report("dirname", dirname)

    s = oldz.read(contents)
    #print s

    p = re.compile(r'<dc:creator.*>(.*)</dc:creator>')
    a = p.search(s)
    creator = a.group(1)
    report("Creator", creator)

    p = re.compile(r'<dc:title>(.*)</dc:title>')
    a = p.search(s)
    title = a.group(1)
    report("Title", title)

    #Find the names of all xhtml & html text files
    p = re.compile(r'\.[x]?htm[l]?')
    htmnames = [i for i in nlist if p.search(i) and i.find('wrap') == -1]

    #Pattern for end of lines that don't need padding
    eolp = re.compile(r'[>}]$')

    newz = zipfile.ZipFile(newname, 'w', zipfile.ZIP_DEFLATED)
    for fname in nlist:
        print fname,
        s = oldz.read(fname)
        if fname == 'mimetype':
            f = open(fname, 'w')
            f.write(s)
            f.close()
            newz.write(fname, fname, zipfile.ZIP_STORED)
            print ' * stored'
            continue

        if fname in htmnames:
            print ' * text',
            #Pad lines that are (hopefully) inside paragraphs...
            newlines = []
            for line in s.splitlines():
                if len(line) == 0 or eolp.search(line):
                    newlines.append(line)
                else:
                    newlines.append(line + ' ')
            s = '\n'.join(newlines)

        newz.writestr(fname, s)
        print

    newz.close()
    oldz.close()

def main():
    oldname = len(sys.argv) > 1 and sys.argv[1]
    if not oldname:
        print 'No filename given!'
        raise SystemExit

    newname = len(sys.argv) > 2 and sys.argv[2]
    if not newname:
        if oldname.rfind('.') == -1:
            newname = oldname + '_P'
        else:
            newname = oldname.replace('.epub', '_P.epub')
        newname = newname.replace(' ', '_')

    print "Processing '%s' to '%s' ..." % (oldname, newname)
    fixepub(oldname, newname)

if __name__ == '__main__':
    main()
FWIW, I wrote this program to process files for my simple e-reader that annoyingly joins paragraphs together if they don't end with white space.
The solution I've found:
- delete the previous mimetype file
- when creating the new archive, create a new mimetype file before adding anything else: zipFile.writestr("mimetype", "application/epub+zip")

Why does it work: the mimetype is the same for every epub ("application/epub+zip"), so there is no need to use the original file.
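A minimal sketch of that approach with the standard zipfile module (file names below are just placeholders): write the mimetype entry first and uncompressed, then add the rest of the content with compression.

import zipfile

# Hypothetical output name and content files, for illustration only
with zipfile.ZipFile("book.epub", "w") as zf:
    # The mimetype entry must come first and must be stored without compression
    zf.writestr("mimetype", "application/epub+zip", compress_type=zipfile.ZIP_STORED)

    # Everything else can be deflated as usual
    zf.write("META-INF/container.xml", "META-INF/container.xml", compress_type=zipfile.ZIP_DEFLATED)
    zf.write("OEBPS/content.opf", "OEBPS/content.opf", compress_type=zipfile.ZIP_DEFLATED)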
I'm looking to extract and modify a specific line of text in many files within a folder but I am having some trouble.
For instance, the first file might read:
To: Bob
From: Bill
<Message> The eagle flies at midnight. <End Message>
The second message is different but in the same format, and so on. I'd like to extract the third line, pass 'The eagle flies at midnight.' through a function (like base64), and then put it back on the line between '<Message>' and '<End Message>', so that the final output would read:
To: Bob
From: Bill
<Message> VGhlIGVhZ2xlIGZsaWVzIGF0IG1pZG5pZ2h0Lg== <End Message>
This is what I am trying (and adjusting) so far.
import base64
import os
import io

#ask user where his stuff is / is going
directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder:")

#get that stuff
myfilepath = os.path.join(directory, '*.txt')
with open('*.txt', 'r') as file:
    data = file.readlines()

#Go to line 3 and take out non encoded text.
data[3] = X
X.strip("<Message>")
X.strip("<End Message>")
coded_string = X

#Encode line 3
base64.b64encode(coded_string)
data[3] = '<Message> %s <End Message>' % (coded_string)

# and write everything back
with open('*.txt', 'w') as file:
    file.writelines(data)
I'm sure there are numerous problems, particularly with how I am opening and writing back. Bonus points: 99% of the messages in this folder are in this exact format, but 1% are junk messages (they don't need to be encoded, and line 3 for them is something different). I'm not too worried about them, but if they could come through unharmed, that'd be nifty. Maybe line 3 should be line 2 if the count starts at 0 ...
Edit: Trying
import re, base64
import os

folder = 'C:/Users/xxx/Desktop/input'
matcher = re.compile("<Message>(?P<text>[^<]*)<End Message>")

for filename in os.listdir(folder):
    infilename = os.path.join(folder, filename)
    if not os.path.isfile(infilename): continue
    base, extension = os.path.splitext(filename)
    filein = open(infilename, 'r')
    fileout = open(os.path.join(folder, '{}_edit.{}'.format(base, extension)), 'w')

for line in filein:
    match = matcher.search(line)
    if match:
        fileout.write("<message> " + base64.b64encode(match.group('text').strip()) + " <End message>\n")
    else:
        fileout.write(line)

filein.close()
fileout.close()
Ultimately this gives me a bunch of blank files except for the last one which is translated properly.
You can use a regular expression to make it easier:
import re, base64

filein = open("examplein.txt", 'r')
fileout = open("exampleout.txt", 'w')

matcher = re.compile("<Message>(?P<text>[^<]*)<End Message>")

for line in filein:
    match = matcher.search(line)
    if match:
        fileout.write("<message> " + base64.b64encode(match.group('text').strip()) + " <End message>\n")
    else:
        fileout.write(line)

filein.close()
fileout.close()
This code works for just one file; you should adapt it to work with all the files in your directory:
import re, base64
import os

folder = '/home/user/Public'
matcher = re.compile("<Message>(?P<text>[^<]*)<End Message>")

for filename in os.listdir(folder):
    infilename = os.path.join(folder, filename)
    if not os.path.isfile(infilename): continue
    base, extension = os.path.splitext(filename)
    filein = open(infilename, 'r')
    fileout = open(os.path.join(folder, '{}_edit.{}'.format(base, extension)), 'w')
    for line in filein:
        match = matcher.search(line)
        if match:
            fileout.write("<message> " + base64.b64encode(match.group('text').strip()) + " <End message>\n")
        else:
            fileout.write(line)
    filein.close()
    fileout.close()
This code works on my PC.
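If you are running Python 3, note that base64.b64encode expects bytes and returns bytes, so the matched text has to be encoded and the result decoded around the call. A sketch of the same loop adjusted for Python 3 (the folder name is just a placeholder):

import re, base64
import os

folder = '/home/user/Public'  # placeholder path
matcher = re.compile("<Message>(?P<text>[^<]*)<End Message>")

for filename in os.listdir(folder):
    infilename = os.path.join(folder, filename)
    if not os.path.isfile(infilename):
        continue
    base, extension = os.path.splitext(filename)
    with open(infilename, 'r') as filein, \
         open(os.path.join(folder, '{}_edit{}'.format(base, extension)), 'w') as fileout:
        for line in filein:
            match = matcher.search(line)
            if match:
                encoded = base64.b64encode(match.group('text').strip().encode()).decode()
                fileout.write("<Message> " + encoded + " <End Message>\n")
            else:
                fileout.write(line)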