python code creates file with tilde in title - python

The following code works as intended to create a new file (or files) using the file name (or names) in the directory. However it also creates a file with a tilde in the title and a duplicate of the word2find. Here is an example result
~$17 Q3 Xcel retire retire.txt
I don't understand why this is happening. Any explanations will be appreciated. Cheers, BobS
import os, os.path,re, pprint,sys, nltk
sourcedir = 'C:/Users/Public/EnvDef/Proj/1 ErnCls/2 IOUErnClsQrtr/2017 Q3/23a Xcel'
os.chdir (sourcedir)
cwd = os.getcwd()
print ('New Current Working Directory %s' % cwd)
for dirPath, subdirNames, fileList in os.walk(cwd):
for filename in fileList:
with open (filename,'r', encoding="ascii", errors ="surrogateescape") as fin:
FileinA=(fin.read())
print (' ')
#create basename by deleting the file name suffix
basename=os.path.splitext(filename)[0]
print (basename)
#tokenize file
FileinB=nltk.sent_tokenize(FileinA)
word2find = 'retire'
result = [sentence for sentence in FileinB if word2find in sentence]
with open (basename+' '+ word2find +'.txt', 'w', encoding="ascii", errors="surrogateescape") as filetowrite:
filetowrite.write(basename+ ' Retire' + "\n" +"\n")
for line in result:
filetowrite.write(line + "\n")

Related

Regular expression in python to filter the files based on match of string

I have over 100 files in a folder, the file names are as follows -
XY1234_2345.1.txt
XY1234_2345.2.txt
and so on
XY1234_4567.1.txt
XY1234_4567.2.txt
and so on
I want to list these files and output them in txt file as filenames:
XY1234_2345.1.txt, XY1234_2345.2.txt, and so on
XY1234_4567.1.txt, XY1234_4567.2.txt, and so on
how can I achieve the desired results using the Python.
You can use Python pathlib:
from pathlib import Path
my_directory = Path(r"path/to/my/directories")
my_file = Path(r"path/to/my/file")
with open(my_file, 'w+') as f:
prefix = None
for file in sorted(my_directory.iterdir()):
file_prefix, _ = file.name.split('_')
if file_prefix == prefix:
f.write(", " + file.name)
else:
f.write("\n" + file.name)
prefix = file_prefix
This will write each filename inside my_directory onto a separate line in the file at my_file.

Patch Movement: File Backup and Replace

I'm getting an error while trying to copy files from a single source directory which contains bug fixes: /home/saurabh/testbed/patch_dir/ to multiple destination directories: app_dir_1 and app_dir_2.
Both these directories are exact replicas of each other.
The script does the following:-
Read lines from a text file into a list. Each line contains the name of one component. In this case: ['file1.class', file2.html]
Search value at each index recursively, starting from a particular directory:
/home/saurabh/testbed/dest_dir/
Take a backup of these files wherever they are found by appending ddMonyyyy to their extension.
Copy files from directory which contains patched components: /home/saurabh/testbed/patch_dir/
to the directory where backup was taken earlier
Directory Overview:-
/home/saurabh/testbed/dest_dir/
|--app_dir_1
|--file1.class
|--file2.html
|--file3.jsp
|--file4.xml
|--sub_dir
|--app_dir_2
|--file1.class
|--file2.html
|--file3.jsp
|--file4.xml
|--sub_dir
|--other_directories
/home/saurabh/testbed/patch_dir/
|--file1.class
|--file2.html
Below is my code:
#!/usr/bin/env python
import os
import fnmatch
import datetime
import shutil
with open('filenames.txt') as f:
content = f.readlines()
content = [x.strip() for x in content]
print('File contents:')
print(content)
suffix = datetime.datetime.now().strftime("_%d%b%Y")
approot = '/home/saurabh/testbed/dest_dir/'
source_dir = '/home/saurabh/testbed/patch_dir/'
dir_list = []
print('\n' + 'Renaming files present at:')
for root, dirs, files in os.walk(approot):
for file_list in content:
for filename in fnmatch.filter(files, file_list):
print(os.path.join(root, filename))
dir_list.append(root)
current_file = os.path.join(root, filename)
backup_file = os.path.join(root, filename + suffix)
os.rename(current_file, backup_file)
print("\n" + "Backup of all files complete!")
print('Backup of ' + str(len(dir_list)) + ' files taken recursively')
print('Number of files mentioned in text file: ' + str(len(content)) + '\n')
# 2 instances in UAT
# 12 instances in PROD
if (2*len(content)) == len(dir_list):
print("Retrofitted components will be copied to their respective directories")
for dst_ind in range(0, len(dir_list)):
if filename in fnmatch.filter(files, file_list):
print(source_dir + content[dst_ind] + "\t" + dir_list[dst_ind])
#shutil.copy2(source_dir+content[dst_ind], dir_list[dst_ind])
I'm getting the below error while copying the files (4.)
File contents:
['file1.class', 'file2.html']
Renaming files present at:
/home/saurabh/testbed/dest_dir/app_dir_1/file1.class
/home/saurabh/testbed/dest_dir/app_dir_1/file2.html
/home/saurabh/testbed/dest_dir/app_dir_2/file1.class
/home/saurabh/testbed/dest_dir/app_dir_2/file2.html
Backup of all files complete!
Backup of 4 files taken recursively
Number of files mentioned in text file: 2
Retrofitted components will be copied to their respective directories
/home/saurabh/testbed/patch_dir/file1.class /home/saurabh/testbed/dest_dir/app_dir_1
/home/saurabh/testbed/patch_dir/file2.html /home/saurabh/testbed/dest_dir/app_dir_1
Traceback (most recent call last):
File "./prod_movement.py", line 56, in <module>
print(source_dir + content[dst_ind] + "\t" + dir_list[dst_ind])
IndexError: list index out of range
Expected Output:
File contents:
['file1.class', 'file2.html']
Renaming files present at:
/home/saurabh/testbed/dest_dir/app_dir_1/file1.class
/home/saurabh/testbed/dest_dir/app_dir_1/file2.html
/home/saurabh/testbed/dest_dir/app_dir_2/file1.class
/home/saurabh/testbed/dest_dir/app_dir_2/file2.html
Backup of all files complete!
Backup of 4 files taken recursively
Number of files mentioned in text file: 2
Retrofitted components will be copied to their respective directories
/home/saurabh/testbed/patch_dir/file1.class /home/saurabh/testbed/dest_dir/app_dir_1
/home/saurabh/testbed/patch_dir/file2.html /home/saurabh/testbed/dest_dir/app_dir_1
/home/saurabh/testbed/patch_dir/file1.class /home/saurabh/testbed/dest_dir/app_dir_2
/home/saurabh/testbed/patch_dir/file2.html /home/saurabh/testbed/dest_dir/app_dir_2
Appreciate any help to fix the code.
Figured it out !!
#!/usr/bin/env python
import os
import fnmatch
import datetime
import shutil
with open('filenames.txt') as f:
content = f.readlines()
content = [x.strip() for x in content]
print('File contents:')
print(content)
suffix = datetime.datetime.now().strftime("_%d%b%Y")
approot = '/Users/saurabhm/Desktop/Python/testbed/dest_dir/'
source_dir = '/Users/saurabhm/Desktop/Python/testbed/patch_dir/'
dir_list = []
print('\n' + 'Renaming files present at:')
for root, dirs, files in os.walk(approot):
for file_list in content:
for filename in fnmatch.filter(files, file_list):
print(os.path.join(root, filename))
dir_list.append(root)
current_file = os.path.join(root, filename)
backup_file = os.path.join(root, filename + suffix)
#print(current_file, backup_file)
os.rename(current_file, backup_file)
#print(source_dir + filename + "\t" + root)
shutil.copy2(source_dir + filename, root)
os.chmod(root + '/' + filename, 0o750)
print("\n" + "Backup of all files complete!")
print('Backup of ' + str(len(dir_list)) + ' files taken recursively')
print('Number of files mentioned in text file: ' + str(len(content)) + '\n')
Once patched components are received over email, they are copied to a directory in the respective server. All these files are consolidated in a single directory (patch_dir)
Backup of existing files (having the same name and are present in "dest_dir") are taken wherever they are found, following which each file is copied from "patch_dir" to the directories inside "dest_dir", where their backup was taken.

Find files in a directory containing desired string in Python

I'm trying to find a string in files contained within a directory. I have a string like banana that I know that exists in a few of the files.
import os
import sys
user_input = input("What is the name of you directory?")
directory = os.listdir(user_input)
searchString = input("What word are you trying to find?")
for fname in directory: # change directory as needed
if searchString in fname:
f = open(fname,'r')
print('found string in file %s') %fname
else:
print('string not found')
When the program runs, it just outputs string not found for every file. There are three files that contain the word banana, so the program isn't working as it should. Why isn't it finding the string in the files?
You are trying to search for string in filename, use open(filename, 'r').read():
import os
user_input = input('What is the name of your directory')
directory = os.listdir(user_input)
searchstring = input('What word are you trying to find?')
for fname in directory:
if os.path.isfile(user_input + os.sep + fname):
# Full path
f = open(user_input + os.sep + fname, 'r')
if searchstring in f.read():
print('found string in file %s' % fname)
else:
print('string not found')
f.close()
We use user_input + os.sep + fname to get full path.
os.listdir gives files and directories names, so we use os.path.isfile to check for files.
Here is another version using the Path module from pathlib instead of os.
def search_in_file(path,searchstring):
with open(path, 'r') as file:
if searchstring in file.read():
print(f' found string in file {path.name}')
else:
print('string not found')
from pathlib import Path
user_input = input('What is the name of your directory')
searchstring = input('What word are you trying to find?')
dir_content = sorted(Path(user_input).iterdir())
for path in dir_content:
if not path.is_dir():
search_in_file(path, searchstring)
This is my solution for the problem. It comes with the feature of also checking in sub-directories, as well as being able to handle multiple file types. It is also quite easy to add support for other ones. The downside is of course that it's quite chunky code. But let me know what you think.
import os
import docx2txt
from pptx import Presentation
import pdfplumber
def findFiles(strings, dir, subDirs, fileContent, fileExtensions):
# Finds all the files in 'dir' that contain one string from 'strings'.
# Additional parameters:
# 'subDirs': True/False : Look in sub-directories of your folder
# 'fileContent': True/False :Also look for the strings in the file content of every file
# 'fileExtensions': True/False : Look for a specific file extension -> 'fileContent' is ignored
filesInDir = []
foundFiles = []
filesFound = 0
if not subDirs:
for filename in os.listdir(dir):
if os.path.isfile(os.path.join(dir, filename).replace("\\", "/")):
filesInDir.append(os.path.join(dir, filename).replace("\\", "/"))
else:
for root, subdirs, files in os.walk(dir):
for f in files:
if not os.path.isdir(os.path.join(root, f).replace("\\", "/")):
filesInDir.append(os.path.join(root, f).replace("\\", "/"))
print(filesInDir)
# Find files that contain the keyword
if filesInDir:
for file in filesInDir:
print("Current file: "+file)
# Define what is to be searched in
filename, extension = os.path.splitext(file)
if fileExtensions:
fileText = extension
else:
fileText = os.path.basename(filename).lower()
if fileContent:
fileText += getFileContent(file).lower()
# Check for translations
for string in strings:
print(string)
if string in fileText:
foundFiles.append(file)
filesFound += 1
break
return foundFiles
def getFileContent(filename):
'''Returns the content of a file of a supported type (list: supportedTypes)'''
if filename.partition(".")[2] in supportedTypes:
if filename.endswith(".pdf"):
content = ""
with pdfplumber.open(filename) as pdf:
for x in range(0, len(pdf.pages)):
page = pdf.pages[x]
content = content + page.extract_text()
return content
elif filename.endswith(".txt"):
with open(filename, 'r') as f:
content = ""
lines = f.readlines()
for x in lines:
content = content + x
f.close()
return content
elif filename.endswith(".docx"):
content = docx2txt.process(filename)
return content
elif filename.endswith(".pptx"):
content = ""
prs = Presentation(filename)
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
content = content+shape.text
return content
else:
return ""
supportedTypes = ["txt", "docx", "pdf", "pptx"]
print(findFiles(strings=["buch"], dir="C:/Users/User/Desktop/", subDirs=True, fileContent=True, fileExtensions=False))
Here is the most simple answer I can give you. You don't need the colors, they are just cool and you may find that you can learn more than one thing in my code :)
import os
from time import sleep
#The colours of the things
class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
# Ask the user to enter string to search
search_path = input("Enter directory path to search : ")
file_type = input("File Type : ")
search_str = input("Enter the search string : ")
# Append a directory separator if not already present
if not (search_path.endswith("/") or search_path.endswith("\\") ):
search_path = search_path + "/"
# If path does not exist, set search path to current directory
if not os.path.exists(search_path):
search_path ="."
# Repeat for each file in the directory
for fname in os.listdir(path=search_path):
# Apply file type filter
if fname.endswith(file_type):
# Open file for reading
fo = open(search_path + fname, 'r')
# Read the first line from the file
line = fo.read()
# Initialize counter for line number
line_no = 1
# Loop until EOF
if line != '' :
# Search for string in line
index = line.find(search_str)
if ( index != -1) :
print(bcolors.OKGREEN + '[+]' + bcolors.ENDC + ' ', fname, sep="")
print(' ')
sleep(0.01)
else:
print(bcolors.FAIL + '[-]' + bcolors.ENDC + ' ', fname, ' ', 'does not contain', ' ', search_str, sep="")
print(" ")
sleep(0.01)
line = fo.readline()
# Increment line counter
line_no += 1
# Close the files
fo.close()
That is it!
I was trying with the following code for this kind of problem, please have a look.
import os,sys
search_path=input("Put the directory here:")
search_str = input("Enter your string")
# Append a directory separator if not already present
if not (search_path.endswith("/") or search_path.endswith("\\") ):
search_path = search_path + "/"
# If path does not exist, set search path to current directory
if not os.path.exists(search_path):
search_path ="."
# Repeat for each file in the directory
for fname in os.listdir(path=search_path):
# Apply file type filter
if fname.endswith(file_type):
# Open file for reading
fo = open(search_path + fname)
# Read the first line from the file
line = fo.readline()
# Initialize counter for line number
line_no = 1
# Loop until EOF
while line != '' :
# Search for string in line
index = line.find(search_str)
if ( index != -1) :
print(fname, "[", line_no, ",", index, "] ", line, sep="")
# Read next line
line = fo.readline()
# Increment line counter
line_no += 1
# Close the files
fo.close()

Python - extract and modify part of a specific line of text with a function for all files in folder

I'm looking to extract and modify a specific line of text in many files within a folder but I am having some trouble.
For instance, the first file might read:
To: Bob
From: Bill
<Message> The eagle flies at midnight. <End Message>
The second message is different, but same format, and so on. I'd like to extract the third line, pass 'The eagle flies at midnight.' through a function (like base64), and then put it back on the line between 'Message' and 'End Message'. Such that the final output would read:
To: Bob
From: Bill
<Message> VGhlIGVhZ2xlIGZsaWVzIGF0IG1pZG5pZ2h0Lg== <End Message>
This is what I am trying (and adjusting) so far.
import base64
import os
import io
#ask user where his stuff is / is going
directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder:")
#get that stuff
myfilepath = os.path.join(directory, '*.txt')
with open('*.txt', 'r') as file:
data = file.readlines()
#Go to line 3 and take out non encoded text.
data[3] = X
X.strip("<Message>")
X.strip("<End Message>")
coded_string = X
#Encode line 3
base64.b64encode(coded_string)
data[3] = '<Message> %s <End Message>' % (coded_string)
# and write everything back
with open('*.txt', 'w') as file:
file.writelines(data)
I'm sure there are numerous problems, particularly with how I am opening and writing back. Bonus points: 99% of the messages in this folder are in this exact format, but there are 1% junk messages (they dont need to be encoded, and line 3 for them is something different). I'm not too worried about them, but if they could be unharmed in the process that'd be nifty. Maybe line 3 should be line 2 if the count starts at 0 ...
Edit: Trying
import re, base64
import os
folder = 'C:/Users/xxx/Desktop/input'
matcher = re.compile("<Message>(?P<text>[^<]*)<End Message>")
for filename in os.listdir(folder):
infilename = os.path.join(folder, filename)
if not os.path.isfile(infilename): continue
base, extension = os.path.splitext(filename)
filein = open(infilename, 'r')
fileout = open(os.path.join(folder, '{}_edit.{}'.format(base, extension)), 'w')
for line in filein:
match = matcher.search(line)
if match:
fileout.write("<message> " + base64.b64encode(match.group('text').strip()) + " <End message>\n")
else:
fileout.write(line)
filein.close()
fileout.close()
Ultimately this gives me a bunch of blank files except for the last one which is translated properly.
You can use regular expression to make it easier as:
import re, base64
filein = open("examplein.txt", 'r')
fileout = open("exampleout.txt", 'w')
matcher = re.compile("<Message>(?P<text>[^<]*)<End Message>")
for line in filein:
match = matcher.search(line)
if match:
fileout.write("<message> " + base64.b64encode(match.group('text').strip()) + " <End message>\n")
else:
fileout.write(line)
filein.close()
fileout.close()
This code works just for one file, you should adapt it to work with all the file in you directory:
import re, base64
import os
folder = '/home/user/Public'
matcher = re.compile("<Message>(?P<text>[^<]*)<End Message>")
for filename in os.listdir(folder):
infilename = os.path.join(folder, filename)
if not os.path.isfile(infilename): continue
base, extension = os.path.splitext(filename)
filein = open(infilename, 'r')
fileout = open(os.path.join(folder, '{}_edit.{}'.format(base, extension)), 'w')
for line in filein:
match = matcher.search(line)
if match:
fileout.write("<message> " + base64.b64encode(match.group('text').strip()) + " <End message>\n")
else:
fileout.write(line)
filein.close()
fileout.close()
This code works in my pc

How to run multiple files using python

I am trying to run multiple text files from folder on my desktop from which i have to search the word olympic and if it find olympic in that 50 text files than it should save it in output.txt , like
textfile1 = 2
textfile2 = 1 and so on upto texfile=50
import glob
import re
write_file = open("output.txt")
flist = glob.glob("./*.py") # adjust glob pattern as desired
print flist
print " lines : path to file"
for fpath in flist:
with open(fpath) as f: #open files
lines = f.readlines()
if (lines == 'olympic'):
write_file.write("%s" % lines) #write the results
print "%6d : %s" % (len(lines),fpath)
#with open securely opens and closes the file
write_file.close() # close file
This is what i am trying to do , but yes i know its full of error :)
i am trying to run multiple files , but not manually , i want that it automatically run the files of whole directory/folder and save their output in one text file..'I have 50 text files and all files have word olympic , some have 1 some have 2/3 etc , i want to count the words from each text file and than save their output in one text file , like textfile1 = 2 , textfile2 = 3 etc in output.txt
Try this
import os
import re
filelist = [file for file in os.listdir('.') if '.py' in file]
for file in filelist:
f = open(file,"r")
lines = f.readlines()
sum=0
sum1=0
for line in lines:
if "olympics" in line:
len(re.findall('\Wolympics\W', line))
sum=sum+1
print sum
elif "Olympics" in line:
sum1=sum1+1
print sum1
print "%6d : %s" % (len(line),file.name)
f.close()
Not really sure what you're attempting to do, but I gave it a shot...
something like that:
fetch.py
#! /usr/bin/env python
import os
import sys
def get_counts(filepath, niddle):
with open(filepath, 'rb') as f:
return f.read().count(niddle)
if __name__ == '__main__':
folder = sys.argv[1]
niddle = sys.argv[2]
assert os.path.isdir(folder), "Not a folder"
output = open('results.txt', 'w')
for dirpath, dirnames, filenames in os.walk(folder):
for filename in filenames:
if not filename.endswith('.html'): # or whatever
continue
filepath = os.path.abspath('%s/%s' % (dirpath, filename))
result = '%s = %d' % (filepath, get_counts(filepath, niddle))
output.write(result)
output.close()
use as:
$ python fetch.py /path/to/search olimpic

Categories

Resources