I have two files, and to check that each one can be read I am using try/except twice. I don't think this is a good method; can you suggest a better way?
Here is my code:
def form_density_dictionary(self,word_file,fp_exclude):
self.freq_dictionary={}
try:
with open(fp_exclude,'r')as fp2:
words_excluded=fp2.read().split() #words to be excluded stored in a list
print("**Read file successfully :" + fp_exclude + "**")
words_excluded=[words.lower() for words in words_excluded] # converted to lowercase
except IOError:
print("**Could not read file:", fp_exclude, " :Please check file name**")
sys.exit()
try:
with open(word_file,'r') as file:
print("**Read file successfully :" + word_file + "**")
words_list=file.read()
if not words_list:
print("**No data in file:",word_file +":**")
sys.exit()
words_list=words_list.split()
words_list=[words.lower() for words in words_list] # lowercasing entire list
unique_words=list((set(words_list)-set(words_excluded)))
self.freq_dictionary= {word:("%6.2f"%(float((words_list.count(word))/len(words_list))*100)) for word in unique_words}
#print((len(self.freq_dictionary)))
except IOError:
print("**Could not read file:", word_file, " :Please check file name**")
sys.exit()
Any other suggestion is also welcomed to make it more pythonic.
The first thing that jumps out is the lack of consistency and readability: in some lines you indent with 4 spaces, on others you only use two; in some places you put a space after a comma, in others you don't, in most places you don't have spaces around the assignment operator (=)...
Be consistent and make your code readable. The most commonly used formatting is to use four spaces for indenting and to always have a space after a comma but even more important than that is to be consistent, meaning that whatever you choose, stick with it throughout your code. It makes it much easier to read for everyone, including yourself.
Here are a few other things I think you could improve:
Have a single exception handling block instead of two.
You can also open both files in a single line.
Even better, combine both previous suggestions and have a separate method to read data from the files, thus eliminating code repetition and making the main method easier to read.
For string formatting it's preferred to use .format() instead of %. Check this out: https://pyformat.info/
Overall try to avoid repetition in your code. If there's something you're doing more than once, extract it to a separate function or method and use that instead.
Here's your code quickly modified to how I'd probably write it, and taking these things into account:
import sys
class AtifImam:
    """Compute how often each word occurs in a text file, as a percentage."""

    def __init__(self):
        # Maps each non-excluded word to its share of all words in the file,
        # formatted as a "{:6.2f}" percentage string.
        self.freq_dictionary = {}

    def form_density_dictionary(self, word_file, exclude_file):
        """Fill ``self.freq_dictionary`` with word densities from ``word_file``.

        Words are compared case-insensitively. Words listed in
        ``exclude_file`` are left out of the result but still count towards
        the total used as the denominator.

        Args:
            word_file: Path of the text file to analyse.
            exclude_file: Path of a file with whitespace-separated words to
                leave out of the result.

        Raises:
            SystemExit: If either file cannot be read or ``word_file``
                contains no words.
        """
        # Local import: the snippet's only module-level import is ``sys``.
        from collections import Counter

        words_excluded = set(
            self.lowercase(self.read_words_list(exclude_file)))
        words_list = self.lowercase(self.read_words_list(word_file))
        if not words_list:
            print("** No data in file: {} **".format(word_file))
            sys.exit()

        # Count every word once instead of calling list.count() per word.
        total = len(words_list)
        self.freq_dictionary = {
            word: "{:6.2f}".format(count / total * 100)
            for word, count in Counter(words_list).items()
            if word not in words_excluded
        }

    @staticmethod
    def read_words_list(file_name):
        """Return the whitespace-separated words of ``file_name`` as a list.

        Prints a status message; exits the program if the file cannot be read.
        """
        try:
            with open(file_name, 'r') as file:
                data = file.read()
        except IOError as e:
            print("** Could not read file: {0.filename} **".format(e))
            sys.exit()
        print("** Read file successfully: {} **".format(file_name))
        return data.split()

    @staticmethod
    def lowercase(word_list):
        """Return a new list with every word converted to lowercase."""
        return [word.lower() for word in word_list]
Exceptions thrown that involve a file system path have a filename attribute that can be used instead of explicit attributes word_file and fp_exclude as you do.
This means you can wrap these IO operations in the same try-except and use the exception_instance.filename which will indicate in which file the operation couldn't be performed.
For example:
try:
    # unknown_file1.py is opened first, so it is the one that fails.
    with open('unknown_file1.py') as f1, open('known_file.py') as f2:
        f1.read()
        f2.read()
except IOError as e:
    # e.filename is the path passed to the open() call that failed.
    print("No such file: {0.filename}".format(e))
Eventually prints out:
No such file: unknown_file1.py
While the opposite:
try:
    # Assuming known_file.py exists, only the second open() fails.
    with open('known_file.py') as f1, open('unknown_file2.py') as f2:
        f1.read()
        f2.read()
except IOError as e:
    # e.filename is the path passed to the open() call that failed.
    print("No such file: {0.filename}".format(e))
Prints out:
No such file: unknown_file2.py
To be more 'pythonic' you could use something called Counter, from the collections library.
from collections import Counter
def form_density_dictionary(self, word_file, fp_exclude):
    """Fill ``self.freq_dictionary`` with the density of each word.

    Each word of ``word_file`` (lowercased) that is not listed in
    ``fp_exclude`` maps to its share of all words in ``word_file``, formatted
    as a ``{:6.2f}`` percentage string. This is the same result the
    original, list-based version of this method produced.

    Args:
        word_file: Path of the text file to analyse.
        fp_exclude: Path of a file with whitespace-separated words to leave
            out of the result.
    """
    success_msg = '*Read file succesfully : {filename}'
    fail_msg = '**Could not read file: {filename}: Please check filename'
    empty_file_msg = '*No data in file :{filename}:**'

    # An empty exclusion file is fine, so no empty-file message is passed.
    exclude_read = self._open_file(fp_exclude, success_msg, fail_msg, '')
    # A set, not a Counter: subtracting Counters only lowers the counts and
    # would keep any word that occurs more often than it is listed.
    excluded = {word.lower() for word in exclude_read.split()}

    word_file_read = self._open_file(
        word_file, success_msg, fail_msg, empty_file_msg)
    words = Counter(word.lower() for word in word_file_read.split())

    # Density is relative to all words read, not to the number of distinct
    # words that remain after exclusion.
    total = sum(words.values())
    self.freq_dictionary = {
        word: '{:6.2f}'.format(count / total * 100)
        for word, count in words.items()
        if word not in excluded
    }
Also, it would be better to put the file handling into a separate _open_file method, like:
def _open_file(self, filename, success_msg, fails_msg, empty_file_msg):
try:
with open(filename, 'r') as file:
if success_msg:
print(success_msg.format(filename= filename))
data = file.read()
if empty_file_msg:
print(empty_file_msg.format(filename= filename))
return data
except IOError:
if fail_msg:
print(fail_msg.format(filename= filename))
sys.exit()
Related
I want to open multiple files and count the words in each of them, but I also want to know how many of the files couldn't be opened.
This is what I tried:
i = 0
def word_count(file_name):
try:
with open(file_name) as f:
content = f.read()
except FileNotFoundError:
pass
i = 0
i += 1
else:
words = content.split()
word_count = len(words)
print(f'file {file_name} has {word_count} words.')
file_name = ['data1.txt','a.txt','data2w.txt','b.txt','data3w.txt','data4w.txt']
for names in file_name:
word_count(names)
print(len(file_name) - i , 'files weren\'t found')
print (i)
So, I get this error:
runfile('D:/~/my')
file data1.txt has 13 words.
file data2w.txt has 24 words.
file data3w.txt has 21 words.
file data4w.txt has 108 words.
Traceback (most recent call last):
File "D:\~\my\readtrydeffunc.py", line 27, in <module>
print(len(file_name) - i , 'files weren\'t found')
NameError: name 'i' is not defined
I tried something else as well, but I think I don't understand scopes well. I think it's because i is assigned outside the except scope, but when I assign i = 0 inside the except scope, I can't print it at the end, because it will be destroyed after execution.
Yes, you're on the right track. You need to define and increment i outside the function, or pass the value through the function, increment, and return the new value. Defining i outside the function is more common, and more Pythonic.
def count_words(file_name):
    """Return the number of whitespace-separated words in ``file_name``.

    Any error from ``open`` (for example ``FileNotFoundError``) propagates
    to the caller, which is expected to handle it.
    """
    with open(file_name) as f:
        content = f.read()
    return len(content.split())
file_name = ['data1.txt','a.txt','data2w.txt','b.txt','data3w.txt','data4w.txt']
i = 0  # number of files that could not be found
for names in file_name:
    try:
        result = count_words(names)
    except FileNotFoundError:
        i += 1
    else:
        # Report the count here, since count_words() itself no longer prints.
        print(f'file {names} has {result} words.')
print(i, 'files weren\'t found')
I would recommend breaking this into two functions: one to handle the word counting and a second to control the flow of the script. The control function should handle any errors that arise, as well as the feedback from those errors.
def word_count(file_name):
    """Print how many whitespace-separated words ``file_name`` contains.

    Any error from ``open`` (for example ``FileNotFoundError``) propagates
    to the caller.
    """
    with open(file_name) as f:
        content = f.read()
    # Named num_words so the local does not shadow this function's own name.
    num_words = len(content.split())
    print(f'file {file_name} has {num_words} words.')
def file_parser(files):
    """Run ``word_count`` on every path in ``files`` and report missing ones.

    Files that raise ``FileNotFoundError`` are counted rather than aborting
    the loop; the total is printed at the end if it is non-zero.
    """
    missing = 0
    for file in files:
        try:
            word_count(file)
        except FileNotFoundError:
            missing += 1
    if missing > 0:
        print(f'{missing} files were not found')
file_names = ['data1.txt','a.txt','data2w.txt','b.txt','data3w.txt','data4w.txt']
file_parser(file_names)
While refactoring your code to not use global variables should be the preferred approach (see edit for a possible refactoring), the minimal modification to get your code running is to remove pass and i = 0 within the except clause, and ask i to be used globally inside your function:
def word_count(file_name):
    """Print the word count of ``file_name``, or count it as missing.

    When the file does not exist, the module-level counter ``i`` is
    incremented instead; ``i`` must be defined before the first such call.
    """
    global i  # use the `i` variable defined at module level
    try:
        with open(file_name) as f:
            content = f.read()
    except FileNotFoundError:
        i += 1  # increment `i` when the file is not found
    else:
        # Named num_words so the local does not shadow this function's name.
        num_words = len(content.split())
        print(f'file {file_name} has {num_words} words.')
i = 0  # number of files not found; updated by word_count()
file_name = ['data1.txt','a.txt','data2w.txt','b.txt','data3w.txt','data4w.txt']
for names in file_name:
    word_count(names)
print(i, 'files weren\'t found')
Note that i will contain the number of files not found.
EDIT
A reasonably refactored code could look something like:
def word_count(filepath):
    """Return the number of whitespace-separated words in ``filepath``.

    The file is read one line at a time, so it is never held in memory as a
    whole. Any error from ``open`` propagates to the caller.
    """
    result = 0
    with open(filepath) as file_obj:
        for line in file_obj:
            result += len(line.split())
    return result
def process_files(filepaths):
    """Count the words of every file in ``filepaths``.

    Returns:
        A ``(result, num_missing)`` tuple: ``result`` maps each readable
        path to its word count, and ``num_missing`` is the number of paths
        that raised ``FileNotFoundError``.
    """
    result = {}
    num_missing = 0
    for filepath in filepaths:
        try:
            num_words = word_count(filepath)
        except FileNotFoundError:
            num_missing += 1
        else:
            result[filepath] = num_words
    return result, num_missing
filenames = [
    'data1.txt', 'a.txt', 'data2w.txt', 'b.txt', 'data3w.txt', 'data4w.txt']
wordcounts, num_missing = process_files(filenames)
for filepath, num_words in wordcounts.items():
    print(f'File {filepath} has {num_words} words.')
# The counter is called num_missing here; there is no `i` in this version.
print(f'{num_missing} files weren\'t found')
Notes:
the word_count() function now only does one thing: word counting. This is done on a line by line basis to better handle potentially long files, which could fill the memory if loaded at once.
the process_files() function extract the essential information and stores them in a dict
all the printing of the results is done in one place, and could be easily wrapped up in a main() function.
num_missing (roughly the former i) is now a local variable.
Finally, note that explicitly counting the exceptions is only one way; the other is to get this information by subtracting the number of elements in result from the number of input filepaths.
This could be done anywhere, there is no need to do this in process_files().
I am new to the world of python/or programming in general.
I have a folder which consist of two .txt files. I want to read the files and create a data structure to store all unique words in those files. This what I have written,
import glob
import errno
path = '/path/to/my/files/*.txt'
files = glob.glob(path)
for name in files:
try:
with open(name, encoding="ISO-8859-1") as f:
f.read()
except IOError as exc:
if exc.errno != errno.EISDIR:
raise
But I don't know how to modify the program to find the unique words. I would appreciate it if you could guide me. Thank you.
you may do this:
import glob
import errno

path = '/path/to/my/files/*.txt'
files = glob.glob(path)
unique = set()  # every distinct word seen in any of the files
for name in files:
    try:
        with open(name, encoding="ISO-8859-1") as f:
            # split() with no argument splits on any whitespace, so words
            # on different lines are not glued together.
            unique.update(f.read().split())
    except IOError as exc:
        # Directories matching the pattern are skipped; anything else is a
        # real error.
        if exc.errno != errno.EISDIR:
            raise
print(sorted(unique))
[Edited] Changed dictionary to set.
Use a set to save the words.
I recommend you to create a function that reads a file and then use it in your for.
For example:
term_list = set()  # all distinct words collected so far


def unique_words(file_path):
    """Add every whitespace-separated word in ``file_path`` to ``term_list``.

    Call it once per file; the words accumulate in the module-level set,
    which is also returned for convenience.
    """
    with open(file_path, "r") as text:
        for line in text:
            # split() drops blank lines and repeated spaces on its own.
            term_list.update(line.split())
    return term_list
try adding 'encoding="latin-1"' to the open function. So
with open(name, encoding="latin-1") as f:
I've never used Python and have copied some script (with permission) from someone online, so I'm not sure why the code is dropping. I'm hoping someone can understand it and put it right for me!
from os import walk
from os.path import join
#First some options here.
!RootDir = "C:\\Users\\***\\Documents\\GoGames"
!OutputFile = "C:\\Users\\***\\Documents\\GoGames\\protable.csv"
Properties = !!['pb', 'pw', 'br', 'wr', 'dt', 'ev', 're']
print """
SGF Database Maker
==================
Use this program to create a CSV file with sgf info.
"""
def getInfo(filename):
"""Read out file info here and return a dictionary with all the
properties needed."""
result = !![]
file = open(filename, 'r')
data = file.read(1024) read at most 1kb since we assume all relevant info is in the beginning
file.close()
for prop in Properties:
try:
i = data.lower().index(prop)
except !ValueError:
result.append((prop, ''))
continue
try:
value = data![data.index('![', i)+1 : data.index(']', i)]
except !ValueError:
value = ''
result.append((prop, value))
return dict(result)
!ProgressCounter = 0
file = open(!OutputFile, "w")
file.write('^Filename^;^PB^;^BR^;^PW^;^WR^;^RE^;^EV^;^DT^\n')
for root, dirs, files in walk(!RootDir):
for name in files:
if name![-3:].lower() != "sgf":
continue
info = getInfo(join(root, name))
file.write('^'+join(root, name)+'^;^'+info!['pb']+'^;^'+info!['br']+'^;^'+info!['pw']+'^;^'+info!['wr']+'^;^'+info!['re']+'^;^'+info!['ev']+'^;^'+info!['dt']+'^\n')
!ProgressCounter += 1
if (!ProgressCounter) % 100 == 0:
print str(!ProgressCounter) + " games processed."
file.close()
print "A total of " + str(!ProgressCounter) + " have been processed."
Using Netbeans IDE I get the following error:
!RootDir = "C:\\Users\\***\\Documents\\GoGames"
^
SyntaxError: mismatched input '' expecting EOF
I have previously been able to step through the code as far as file.close(), where I got the error "does not match outer indentation level".
Anyone able to put the syntax of this code right for me?
Remove the exclamation marks in front of variable names, list declarations (!![]) and in except clauses (except !ValueError), this is not valid Python syntax.
I'd like to read a file for a specific match in the style "word = word"; specifically, I'm looking to find files with usernames and passwords in them. These files would be scripts created by admins using bad practices, with clear-text credentials being used in logon scripts etc.
The code I have created so far does the job, but it's very messy and prints the entire line if a match is found (I can't help but think there is a more elegant way to do this). This creates ugly output; I'd like to print only the match in the line, but I can't seem to find a way to do that. If I can create the correct regex for something like the match below, is it possible to print only the match found in the line rather than the entire line?
(I am going to try describe the type of match im looking for)
Key
* = wildcard
- = space
^ = anycharacter until a space
Match
*(U|u)ser^-=-^
dirt = "/dir/path/"
def get_files():
for root, dirs, files in os.walk(dirt):
for filename in files:
if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
readfile = open(os.path.join(root, filename), "r")
for line in readfile:
if re.match("(.*)(U|u)ser(.*)", line) and re.match("(.*)(=)(.*)", line) or re.match("(.*)(P|p)ass(.*)", line) and re.match("(.*)(=)(.*)", line):
print line
TEST SCRIPT
strComputer = "atl-ws-01"
strNamespace = “root\cimv2”
strUser = "Administrator"
strPassword = "4rTGh2#1"
user = AnotherUser #Test
pass = AnotherPass #test
Set objWbemLocator = CreateObject("WbemScripting.SWbemLocator")
Set objWMIService = objwbemLocator.ConnectServer _
(strComputer, strNamespace, strUser, strPassword)
objWMIService.Security_.authenticationLevel = WbemAuthenticationLevelPktPrivacy
Set colItems = objWMIService.ExecQuery _
("Select * From Win32_OperatingSystem")
For Each objItem in ColItems
Wscript.Echo strComputer & ": " & objItem.Caption
Next
Latest code after taking on board the responses
This is the latest code I am using. It seems to be doing the job as expected, except that the output isn't managed as well as I'd like. I'd like to add the items to a dictionary, the key being the file name, with two values: the username and the password. That will be asked as a separate question, though.
Thanks all for the help
dirt = "~/Desktop/tmp"
def get_files():
regs = ["(.*)((U|u)ser(.*))(\s=\s\W\w+\W)", "(.*)((U|u)ser(.*))(\s=\s\w+)", "(.*)((P|p)ass(.*))\s=\s(\W(.*)\W)", "(.*)((P|p)ass(.*))(\s=\s\W\w+\W)"]
combined = "(" + ")|(".join(regs) + ")"
results = dict()
for root, dirs, files in os.walk(dirt):
for filename in files:
if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
readfile = open(os.path.join(root, filename), "r")
for line in readfile:
m = re.match(combined, line)
if m:
print os.path.join(root, filename)
print m.group(0)
Latest Code output
~/Desktop/tmp/Domain.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript1.vbs
strUser = "guytom"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts /Logon/logonscript1.vbs
strPassword = "P#ssw0rd1"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript2.bat
strUsername = "guytom2"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript2.bat
strPass = "SECRETPASSWORD"
https://docs.python.org/2/library/re.html
group([group1, ...])
Returns one or more subgroups of the match. If there is a single argument, the result is a single string; if there are multiple arguments, the result is a tuple with one item per argument. Without arguments, group1 defaults to zero (the whole match is returned). If a groupN argument is zero, the corresponding return value is the entire matching string;
match.group(0)
Since you can have many object=value you need to use regular expressions. Here is some sample code for you.
line1 = " someuser = bob "
line2 = " bob'spasswd= secretpassword"
# re.I makes the search case-insensitive.
# Group 1 captures the value to the right of the "=" sign.
userMatchObj = re.search(r'.*user.*=\s*([\S]*).*', line1, re.I)
pwdMatchObj = re.search(r'.*pass.*=\s*(.*)', line2, re.I)
if userMatchObj:
    print("user=" + userMatchObj.group(1))
if pwdMatchObj:
    print("password=" + pwdMatchObj.group(1))
output:
user=bob
password=secretpassword
References: https://docs.python.org/2/library/re.html , http://www.tutorialspoint.com/python/python_reg_expressions.htm
Thanks all for the help. Below is my working code (needs further work on the output but the matching is working well)
dirt = "~/Desktop/tmp"


def get_files(directory=None):
    """Print credential-like assignments found in script files.

    Walks ``directory`` (the module-level ``dirt`` when omitted) and, for
    every ``.bat``, ``.vbs``, ``.ps`` or ``.txt`` file, prints the file path
    followed by the matched text for each line that looks like a
    ``...user... = value`` or ``...pass... = value`` assignment.
    """
    regs = [
        r"(.*)((U|u)ser(.*))(\s=\s\W\w+\W)",
        r"(.*)((U|u)ser(.*))(\s=\s\w+)",
        r"(.*)((P|p)ass(.*))\s=\s(\W(.*)\W)",
        r"(.*)((P|p)ass(.*))(\s=\s\W\w+\W)",
    ]
    # One alternation of all patterns, compiled once for the whole walk.
    combined = re.compile("(" + ")|(".join(regs) + ")")
    start = dirt if directory is None else directory
    for root, dirs, files in os.walk(start):
        for filename in files:
            if not filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
                continue
            full_path = os.path.join(root, filename)
            # The with-block closes each file before the next one is opened.
            with open(full_path, "r") as readfile:
                for line in readfile:
                    m = combined.match(line)
                    if m:
                        print(full_path)
                        print(m.group(0))
Latest Code output
~/Desktop/tmp/Domain.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript1.vbs
strUser = "guytom"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript1.vbs
strPassword = "P#ssw0rd1"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript2.bat
strUsername = "guytom2"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript2.bat
strPass = "SECRETPASSWORD"
I have written a script on a python "icecast server", and I changed some strings in "/etc/icecast2/icecast.xml" like this:
import os,sys,re
def ices2():
changedir=open(pathh + "icecast3.xml", "w")
data=open("/etc/icecast2/icecast.xml").read()
changedir.write(re.sub("<source-password>hackme</source-password>","<source-password>123</source-password>" % x,data))
changedir.close()
ices2()
def ices1():
changedir1=open(pathh + "icecast2.xml", "w")
data=open(pathh + "icecast3.xml").read()
changedir1.write(re.sub("<relay-password>hackme</relay-password>", "<relay-password>123</relay-password>" % x,data))
changedir1.close()
os.remove(pathh + "icecast3.xml")
ices1()
def ices():
changedir2=open("/etc/icecast2/icecast.xml", "w")
data=open(pathh + "icecast2.xml").read()
changedir2.write(re.sub("<admin-password>hackme</admin-password>","<admin-password>123</admin-password>" % x,data))
changedir2.close()
os.remove(pathh + "icecast2.xml")
ices()
...but this is too long for a script. How can I shorten it? I need to make several changes in one file: open it, make the changes, and close it without losing any data. I know that it can be done in one function, but I don't know how to do it.
I need three changes in one function like this:
def ices():
changedir=open(pathh + "icecast3.xml", "w")
data=open("/etc/icecast2/icecast.xml").read()
changedir.write(re.sub("<source-password>hackme</source-password>","<source-password>123</source-password>",data))
changedir1.write(re.sub("<relay-password>hackme</relay-password>", "<relay-password>123</relay-password>",data))
changedir2.write(re.sub("<admin-password>hackme</admin-password>","<admin-password>123</admin-password>",data))
changedir.close()
I did it in one function, and my script is shorter than the one above. But it's wrong; I need to do it correctly.
changedir=open(pathh + "icecast3.xml", "w")
data=open("/etc/icecast2/icecast.xml").read()
Here I create a new file, pathh + "icecast3.xml" (where pathh is /home/user/Downloads), but I need to open the file:
"/etc/icecast2/icecast.xml"
...read it and write changes to the same file.
All three functions do the same so you can join them into one. This is not complete solution but I think that you could go on from here on your own:
import os,sys,re
def ices(in_path, out_path, remove=False, tags=("source",), old="hackme",
         new="123"):
    """Copy ``in_path`` to ``out_path`` with the given passwords replaced.

    For every name in ``tags``, each ``<NAME-password>old</NAME-password>``
    element is rewritten to hold ``new``. The input is read completely
    before the output is opened, so ``in_path`` and ``out_path`` may be the
    same file.

    Args:
        in_path: File to read.
        out_path: File to write (created or overwritten).
        remove: If true, delete ``in_path`` afterwards.
        tags: Element name prefixes to process, e.g.
            ``("source", "relay", "admin")``.
        old: Password text to look for.
        new: Password text to write instead.
    """
    with open(in_path, 'r') as source:
        data = source.read()

    for tag in tags:
        element = "<{0}-password>{1}</{0}-password>"
        # Plain text replacement: nothing here needs regex matching.
        data = data.replace(element.format(tag, old), element.format(tag, new))

    with open(out_path, "w") as changedir:
        changedir.write(data)
    if remove:
        os.remove(in_path)
You can call this function with:
ices(base_path + 'icecast2.xml', base_path + 'icecast3.xml', True)
Hints:
it's better to use os.path.join for creating the full paths (as opposed to string concatenation)
look at the with statement and consider using it for increased readability
EDIT (respecting the clarification in comment):
Sorry I missed the different strings in write. You can do it simply like this:
with open(filename, 'r') as f:
    data = f.read()

for tag in ['source', 'relay', 'admin']:
    # Builds e.g. "<source-password>%s</source-password>"; the %s is then
    # filled with the old and the new password respectively.
    sub_str = "<{tag_name}>%s</{tag_name}>".format(tag_name=tag+'-password')
    data = re.sub(sub_str % 'hackme', sub_str % '123', data)

with open(filename+'.new', 'w') as f:
    f.write(data)