I'm very new to OOP and I have been trying to write a class I can import which will help me with parsing files. I realize I do not need to make a class to do this, but thought I'd try to so I can start getting familiar with OOP.
This code works
import re
import os
destdir = r"FilePathToDirectory"
class Parsey():
def GetNums(self,source, destination, trim = True):
with open (os.path.join(destdir,source), 'r') as infile:
with open (os.path.join(destdir,destination), 'w') as outfile:
for line in infile:
#Look for number patern match
if re.match(r'(.*)\d\d-\d\d-\d\d\d\d(.*)', line):
#If trim is True clean up the line
if trim == True:
#Find the first numeric character
firstdig = re.search("\d",line)
#Set the firstdig variable to the integer of that index
firstdig = firstdig.start()
#Set the line equal to only the begining and ending indexes of the number
line=line[firstdig:firstdig+10]
#Remove the dashes from the string
line = line.replace('-','')
outfile.writelines(line+'\n')
else:
outfile.writelines(line)
This code does not and I'm not sure why it doesn't.
import re
import os
class Parsey():
def __init__(self, destdir=''):
self.destdir = r"FilePathToDirectory"
def GetNums(self,source, destination, trim = True):
with open (os.path.join(destdir,source), 'r') as infile:
with open (os.path.join(destdir,destination), 'w') as outfile:
for line in infile:
#Look for number patern match
if re.match(r'(.*)\d\d-\d\d-\d\d\d\d(.*)', line):
#If trim is True clean up the line
if trim == True:
#Find the first numeric character
firstdig = re.search("\d",line)
#Set the firstdig variable to the integer of that index
firstdig = firstdig.start()
#Set the line equal to only the begining and ending indexes of the number
line=line[firstdig:firstdig+11]
#Remove the dashes from the string
line = line.replace('-','')
outfile.writelines(line+'\n')
else:
outfile.writelines(line)
I receive the error:
line 10, in GetNums
with open (os.path.join(destdir,source), 'r') as infile:
NameError: name 'destdir' is not defined
It was my understanding that the namespace of the class object would allow the functions within the class to see all variables declared in that class.
You need to change line 10 to:
with open (os.path.join(self.destdir, destination), 'w') as outfile:
In your case Python looks for testdir inside GetNums first and, if it cannot find it there, it will look for this name in the module. It does not magically use tesdir from __init__. The name self stands for the instance you will create later. So in __init__ you essentially set mysinstance.testdir and later in GetNums you can access with mysinstance.testdir. self is just the placeholder for mysinstance, i.e the instance you create later.
You can read the detail in the documentation.
#Mike Müller nailed it, but here is the corrected code in its entirety.
import re
import os
class Parsey():
def __init__(self, destdir=''):
self.destdir = r"FilePathToDirectory"
def GetNums(self,source, destination, trim = True):
with open (os.path.join(self.destdir,source), 'r') as infile:
with open (os.path.join(self.destdir,destination), 'w') as outfile:
for line in infile:
#Look for number patern match
if re.match(r'(.*)\d\d-\d\d-\d\d\d\d(.*)', line):
#If trim is True clean up the line
if trim == True:
#Find the first numeric character
firstdig = re.search("\d",line)
#Set the firstdig variable to the integer of that index
firstdig = firstdig.start()
#Set the line equal to only the begining and ending indexes of the number
line=line[firstdig:firstdig+10]
#Remove the dashes from the string
line = line.replace('-','')
outfile.writelines(line+'\n')
else:
outfile.writelines(line)
Related
I need to check if the .csv file I'm working with ends with more than 1 "\n" line. If it finds more than a blank line, it removes them all but one.
My code is:
import os
from pathlib import Path
def remove_blanks():
dirname = os.path.dirname(os.path.abspath(__file__))
path: Path = Path(os.path.join(dirname, "data.csv"))
with open(path, "r+") as op:
lines = op.readlines()
for line in lines:
if line == "\n":
op.write(line.rstrip("\n"))
The .csv file is something like ['01-01-2019,0,0,0\n', '18-05-2019,33,31,48\n', '\n', '\n', '\n'] and the output I'd want is ['01-01-2019,0,0,0\n', '18-05-2019,33,31,48\n', '\n'] but it doesn't seem to be able to delete any line.
The simplest way would be to keep track if you've seen an empty line, then write one just before you write a non-empty line.
pre = ""
for line in lines:
if line == "\n":
pre = line
else:
op.write(pre)
op.write(line)
pre = "\n"
op.write(pre)
This reduces any sequence of empty lines to a single empty line, and writes that single line just before writing a non-empty line or the end of the file. When pre is the empty string, writing it is a no-op.
If you want to preserve multiple blank lines in the middle of the file, build up the sequence of blank lines in pre as you find them, and at the end of the file, only write a single blank line (rather than pre itself) if pre is not empty.
pre = ""
for line in lines:
if line == "\n":
pre += line
else:
op.write(pre)
op.write(line)
pre = ""
if pre:
op.write("\n")
Oops, never rewrite the file that you are reading: it is likely not to work or at best will lead to a maintenance nightmare.
If the file is small enough to fit in main memory, this slight change in your code could be enough:
import os.path
from pathlib import Path
def remove_blanks():
dirname = os.path.dirname(os.path.abspath(__file__))
path: Path = Path(os.path.join(dirname, "data.csv"))
with open(path, "r") as op:
lines = op.readlines() # read lines in memory
with open(path("w") as op: # re-write everything from the beginning
flag = False
for line in lines:
if line == "\n":
if not flag:
op.write(line)
flag = True
else:
op.write(line)
# flag = False # uncomment if you want to keep one blank line
# per group of consecutive lines
You could try using the Counter().
import os
from pathlib import Path
from collections import Counter
def remove_blanks():
dirname = os.path.dirname(os.path.abspath(__file__))
path: Path = Path(os.path.join(dirname, "data.csv"))
with open(path, "r+") as op:
lines = op.readlines()
for line in lines:
count = Counter()
# Add 1 for every time word appears in line
for word in line:
count[word] += 1
# Change the number of newlines to 1
if count['\n'] > 1:
count['\n'] = 1
# Returns list with the number of elements
line = list(count.elements())
I managed to work this out, with this code:
import os
from pathlib import Path
def remove_blanks():
dirname = os.path.dirname(os.path.abspath(__file__))
path: Path = Path(os.path.join(dirname, "data.csv"))
with open(path, "r") as op:
lines = op.readlines() # read lines in memory
with open(path, "w") as op: # re-write everything from the beginning
for line in lines:
if line != "\n":
op.write(line)
else:
continue
It can remove every new line in excess, no matter where it is in the file.
Thanks to everyone who tried to help me!
I have 2 numbers in two similar files. There is a new.txt and original.txt. They both have the same string in them except for a number. The new.txt has a string that says boothNumber="3". The original.txt has a string that says boothNumber="1".
I want to be able to read the new.txt, pick the number 3 out of it and replace the number 1 in original.txt.
Any suggestions? Here is what I am trying.
import re # used to replace string
import sys # some of these are use for other code in my program
def readconfig():
with open("new.text") as f:
with open("original.txt", "w") as f1:
for line in f:
match = re.search(r'(?<=boothNumber=")\d+', line)
for line in f1:
pattern = re.search(r'(?<=boothNumber=")\d+', line)
if re.search(pattern, line):
sys.stdout.write(re.sub(pattern, match, line))
When I run this, my original.txt gets completely cleared of any text.
I did a traceback and I get this:
in readconfig
for line in f1:
io.UnsupportedOperationo: not readable
UPDATE
I tried:
def readconfig(original_txt_path="original.txt",
new_txt_path="new.txt"):
with open(new_txt_path) as f:
for line in f:
if not ('boothNumber=') in line:
continue
booth_number = int(line.replace('boothNumber=', ''))
# do we need check if there are more than one 'boothNumber=...' line?
break
with open(original_txt_path) as f1:
modified_lines = [line.startswith('boothNumber=') if not line
else 'boothNumber={}'.format(booth_number)
for line in f1]
with open(original_txt_path, mode='w') as f1:
f1.writelines(modified_lines)
And I get error:
booth_number = int(line.replace('boothNumber=', ''))
ValueError: invalid literal for int() with base 10: '
(workstationID="1" "1" window=1= area="" extra parts of the line here)\n
the "1" after workstationID="1" is where the boothNumber=" " would normally go. When I open up original.txt, I see that it actually did not change anything.
UPDATE 3
Here is my code in full. Note, the file names are changed but I'm still trying to do the same thing. This is another idea or revision I had that is still not working:
import os
import shutil
import fileinput
import re # used to replace string
import sys # prevents extra lines being inputed in config
# example: sys.stdout.write
def convertconfig(pattern):
source = "template.config"
with fileinput.FileInput(source, inplace=True, backup='.bak') as file:
for line in file:
match = r'(?<=boothNumber=")\d+'
sys.stdout.write(re.sub(match, pattern, line))
def readconfig():
source = "bingo.config"
pattern = r'(?<=boothNumber=")\d+' # !!!!!!!!!! This probably needs fixed
with fileinput.FileInput(source, inplace=True, backup='.bak') as file:
for line in file:
if re.search(pattern, line):
fileinput.close()
convertconfig(pattern)
def copyfrom(servername):
source = r'//' + servername + '/c$/remotedirectory'
dest = r"C:/myprogram"
file = "bingo.config"
try:
shutil.copyfile(os.path.join(source, file), os.path.join(dest, file))
except:
print ("Error")
readconfig()
# begin here
os.system('cls' if os.name == 'nt' else 'clear')
array = []
with open("serverlist.txt", "r") as f:
for servername in f:
copyfrom(servername.strip())
bingo.config is my new file
template.config is my original
It's replacing the number in template.config with the literal string "r'(?<=boothNumber=")\d+'"
So template.config ends up looking like
boothNumber="r'(?<=boothNumber=")\d+'"
instead of
boothNumber="2"
To find boothNumber value we can use next regular expression (checked with regex101)
(?<=\sboothNumber=\")(\d+)(?=\")
Something like this should work
import re
import sys # some of these are use for other code in my program
BOOTH_NUMBER_RE = re.compile('(?<=\sboothNumber=\")(\d+)(?=\")')
search_booth_number = BOOTH_NUMBER_RE.search
replace_booth_number = BOOTH_NUMBER_RE.sub
def readconfig(original_txt_path="original.txt",
new_txt_path="new.txt"):
with open(new_txt_path) as f:
for line in f:
search_res = search_booth_number(line)
if search_res is None:
continue
booth_number = int(search_res.group(0))
# do we need check if there are more than one 'boothNumber=...' line?
break
else:
# no 'boothNumber=...' line was found, so next lines will fail,
# maybe we should raise exception like
# raise Exception('no line starting with "boothNumber" was found')
# or assign some default value
# booth_number = -1
# or just return?
return
with open(original_txt_path) as f:
modified_lines = []
for line in f:
search_res = search_booth_number(line)
if search_res is not None:
line = replace_booth_number(str(booth_number), line)
modified_lines.append(line)
with open(original_txt_path, mode='w') as f:
f.writelines(modified_lines)
Test
# Preparation
with open('new.txt', mode='w') as f:
f.write('some\n')
f.write('<jack Fill workstationID="1" boothNumber="56565" window="17" Code="" area="" section="" location="" touchScreen="False" secureWorkstation="false">')
with open('original.txt', mode='w') as f:
f.write('other\n')
f.write('<jack Fill workstationID="1" boothNumber="23" window="17" Code="" area="" section="" location="" touchScreen="False" secureWorkstation="false">')
# Invocation
readconfig()
# Checking output
with open('original.txt') as f:
for line in f:
# stripping newline character
print(line.rstrip('\n'))
gives
other
<jack Fill workstationID="1" boothNumber="56565" window="17" Code="" area="" section="" location="" touchScreen="False" secureWorkstation="false">
The bingo.config has a string with a random integer in it.
Inside the file, it looks like
boothNumber="2"
My template.config file has the same string with a different integer or number.
I am trying to replace this with the current integer in bingo.config
My problem is that the number in template.config is not being replaced by the number in bingo.config
It's replacing the number in template.config with the literal string "r'(?<=boothNumber=")\d+'"
So template.config ends up looking like
boothNumber="r'(?<=boothNumber=")\d+'"
instead of
boothNumber="2"
It looks like my issue is stemming from how I save the variable "pattern" in the "readconfig" function.
Any ideas on how I can save the integer from bingo.config into a proper variable that can be used?
import os
import shutil
import fileinput
import re # used to replace string
import sys # prevents extra lines being inputed in config
# example: sys.stdout.write
def convertconfig(pattern):
source = "template.config"
with fileinput.FileInput(source, inplace=True, backup='.bak') as file:
for line in file:
match = r'(?<=boothNumber=")\d+'
sys.stdout.write(re.sub(match, pattern, line))
def readconfig():
source = "bingo.config"
pattern = r'(?<=boothNumber=")\d+' # !!!!!!!!!! This probably needs fixed
with fileinput.FileInput(source, inplace=True, backup='.bak') as file:
for line in file:
if re.search(pattern, line):
fileinput.close()
convertconfig(pattern)
def copyfrom(servername):
source = r'//' + servername + '/c$/remotedirectory'
dest = r"C:/myprogram"
file = "bingo.config"
try:
shutil.copyfile(os.path.join(source, file), os.path.join(dest, file))
except:
print ("Error")
readconfig()
# begin here
os.system('cls' if os.name == 'nt' else 'clear')
array = []
with open("serverlist.txt", "r") as f:
for servername in f:
copyfrom(servername.strip())
I am running Python 3.5.1
I have a text file that I'm trying to search through and replace or overwrite text if it matches a predefined variable. Below is a simple example:
test2.txt
A Bunch of Nonsense Stuff
############################
# More Stuff Goes HERE #
############################
More stuff here
Outdated line of information that has no comment above - message_label
The last line in this example needs to be overwritten so the new file looks like below:
test2.txt after script
A Bunch of Nonsense Stuff
############################
# More Stuff Goes HERE #
############################
More stuff here
# This is an important line that needs to be copied
Very Important Line of information that the above line is a comment for - message_label
The function I have written idealAppend does not work as intended and subsequent executions create a bit of a mess. My workaround has been to separate the two lines into single line variables but this doesn't scale well. I want to use this function throughout my script with the ability to handle any number of lines. (if that makes sense)
Script
#!/usr/bin/env python3
import sys, fileinput, os
def main():
file = 'test2.txt'
fullData = r'''
# This is an important line that needs to be copied
Very Important Line of information that the above line is a comment for - message_label
'''
idealAppend(file, fullData)
def idealAppend(filename, data):
label = data.split()[-1] # Grab last word of the Append String
for line in fileinput.input(filename, inplace=1, backup='.bak'):
if line.strip().endswith(label) and line != data: # If a line 2 exists that matches the last word (label)
line = data # Overwrite with new line, comment, new line, and append data.
sys.stdout.write(line) # Write changes to current line
with open(filename, 'r+') as file: # Open File with rw permissions
line_found = any(data in line for line in file) # Search if Append exists in file
if not line_found: # If data does NOT exist
file.seek(0, os.SEEK_END) # Goes to last line of the file
file.write(data) # Write data to the end of the file
if __name__ == "__main__": main()
Workaround Script
This seems to work perfectly as long as I only need to write exactly two lines. I'd love this to be more dynamic when it comes to number of lines so I can reuse the function easily.
#!/usr/bin/env python3
import sys, fileinput, os
def main():
file = 'test2.txt'
comment = r'# This is an important line that needs to be copied'
append = r'Very Important Line of information that the above line is a comment for - message_label'
appendFile(file, comment, append)
def appendFile(filename, comment, append):
label = append.split()[-1] # Grab last word of the Append String
for line in fileinput.input(filename, inplace=1, backup='.bak'):
if line.strip().endswith(label) and line != append: # If a line 2 exists that matches the last word (label)
line = '\n' + comment + '\n' + append # Overwrite with new line, comment, new line, and append data.
sys.stdout.write(line) # Write changes to current line
with open(filename, 'r+') as file: # Open File with rw permissions
line_found = any(append in line for line in file) # Search if Append exists in file
if not line_found: # If data does NOT exist
file.seek(0, os.SEEK_END) # Goes to last line of the file
file.write('\n' + comment + '\n' + append) # Write data to the end of the file
if __name__ == "__main__": main()
I am very new to Python so I'm hoping there is a simple solution that I overlooked. I thought it might make sense to try and split the fullData variable at the new line characters into a list or tuple, filter the label from the last item in the list, then output all entries but this is starting to move beyond what I've learned so far.
If I understand your issue correctly, you can just open the input and output files, then check whether the line contains old information and ends with the label and write the appropriate content accordingly.
with open('in.txt') as f, open('out.txt', 'r') as output:
for line in f:
if line.endswith(label) and not line.startswith(new_info):
output.write(replacement_text)
else:
output.write(line)
If you want to update the original file instead of creating a second one, it's easiest to just delete the original and rename the new one instead of trying to modify it in place.
Is this what you are looking for ? It's looking for a label and then replaces the whole line with whatever you want.
test2.txt
A Bunch of Nonsense Stuff
############################
# More Stuff Goes HERE #
############################
More stuff here
Here is to be replaced - to_replace
script.py
#!/usr/bin/env python3
def main():
file = 'test2.txt'
label_to_modify = "to_replace"
replace_with = "# Blabla\nMultiline\nHello"
"""
# Raw string stored in a file
file_replace_with = 'replace_with.txt'
with open(file_replace_with, 'r') as f:
replace_with = f.read()
"""
appendFile(file, label_to_modify, replace_with)
def appendFile(filename, label_to_modify, replace_with):
new_file = []
with open(filename, 'r') as f:
for line in f:
if len(line.split()) > 0 and line.split()[-1] == label_to_modify:
new_file.append(replace_with)
else:
new_file.append(line)
with open(filename + ".bak", 'w') as f:
f.write(''.join(new_file))
if __name__ == "__main__": main()
test2.txt.bak
A Bunch of Nonsense Stuff
############################
# More Stuff Goes HERE #
############################
More stuff here
# Blabla
Multiline
Hello
Reading over both answers I've come up with the following as the best solution i can get to work. It seems to do everything I need. Thanks Everyone.
#!/usr/bin/env python3
def main():
testConfFile = 'test2.txt' # /etc/apache2/apache2.conf
testConfLabel = 'timed_combined'
testConfData = r'''###This is an important line that needs to be copied - ##-#-####
Very Important Line of information that the above line is a \"r\" comment for - message_label'''
testFormatAppend(testConfFile, testConfData, testConfLabel) # Add new test format
def testFormatAppend(filename, data, label):
dataSplit = data.splitlines()
fileDataStr = ''
with open(filename, 'r') as file:
fileData = stringToDictByLine(file)
for key, val in fileData.items():
for row in dataSplit:
if val.strip().endswith(row.strip().split()[-1]):
fileData[key] = ''
fileLen = len(fileData)
if fileData[fileLen] == '':
fileLen += 1
fileData[fileLen] = data
else:
fileLen += 1
fileData[fileLen] = '\n' + data
for key, val in fileData.items():
fileDataStr += val
with open(filename, 'w') as file:
file.writelines(str(fileDataStr))
def stringToDictByLine(data):
fileData = {}
i = 1
for line in data:
fileData[i] = line
i += 1
return fileData
if __name__ == "__main__": main()
I am making a flashcard program in which I take a text file that contains several columns, such as english word, french equivalent, gender, type of word, etc. My idea was to create a loop that read each line of the text file, separating by tabs, and makes an instance of a user-defined Word object for each line.
In the following block code I import the text file, process it into a list, then attempt to create an instance of a previously defined object: Word. I would like the object to have the second item on the list for it's name so that it is easily searchable, but it's not letting me do this, please can somebody help me with the code:
file = (open('dictionary.txt', 'r')).readline()
import re
line_list = re.split(r'\t', file.rstrip('\n'))
line_list[1] = Word(line_list[0], line_list[1], line_list[2], line_list[3])
Create a dict of instances and use the second item of the lists as key. It's a bad idea to create dynamic variables.
import re
instance_dict = {}
with open('dictionary.txt') as f:
for line in f:
line_list = re.split(r'\t', line.rstrip('\n'))
instance_dict[line_list[1]] = Word(*line_list[:4])
Why the with statement?
It is good practice to use the with keyword when dealing with file
objects. This has the advantage that the file is properly closed after
its suite finishes, even if an exception is raised on the way.
You can also use the csv module:
import csv
instances = {}
with open('dictionary.txt', 'rb') as f:
reader = csv.reader(f, delimiter='\t')
instances = {line[1]: Word(*line) for line in reader}
Here's a cleaner solution using a namedtuple. You'll end up with a dict called "words" which you use to lookup each by name.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pprint
from collections import namedtuple
Word = namedtuple('Word', ['name', 'french', 'gender', 'type_'])
words = {}
with open('dictionary.txt', 'rU') as fin:
for word in (Word(*r.rstrip('\n').split('\t')) for r in fin):
words[word.name] = word
pprint.pprint(words)
Firstly, it's better to use with, as statements to get input from files, as the closing procedures are automatically taken care of. Secondly, to read ALL of the lines from a file, you must use readlines() rather than readline(). Try something like this :
with open('dictionary.txt','r') as file :
line_list = file.readlines()
splitLineList = []
for lines in line_list :
splitLineList.append(re.split(r'\t',lines.strip('\n'))
You may have an appropriate solution depending on few clarification on your requirements
"My idea was to create a loop that read each line of the text file,
separating by tabs, and"
If the text file is already pre-validated or reliable to ignore error-handling (e.g. not evenly separated by single tabs).
with open('dictionary.txt', 'r') as f:
[line.strip().split("\t")
for line in f.read().split("\n")
if line.strip()]
will get you the (comprehensive) list required to create Word object instances, without using re
"then attempt to create an instance of a previously defined object:
Word."
with open('dictionary.txt', 'r') as f:
[Word(line.strip().split("\t"))
for line in f.read().split("\n")
if line.strip()]
"I would like the object to have the second item on the list for it's
name so that it is easily searchable,"
Can you rewrite this with an example?
but it's not letting me do this,
line_list[1] = Word(line_list[0], line_list[1], line_list[2], line_list[3])
Sorry I am loosing you here, why are using line_list[1] to refer newly created Word instances where line_list[1] itself is an argument ?
With your clarification, I would have something like this
Reworked Code:
from pprint import pprint
My assumption on your Class definition:
class Word():
def __init__(self, **kwargs):
self.set_attrs(**kwargs)
def __call__(self):
return self.get_attr("swedish_word")
def set_attrs(self, **kwargs):
for k, v in kwargs.iteritems():
setattr(self, k, v)
def get_attr(self, attr):
return getattr(self, attr)
def get_attrs(self):
return ({attr.upper():getattr(self, attr) for attr in self.__dict__.keys()})
def print_attrs(self):
pprint(self.get_attrs())
if __name__ == '__main__':
# sample entries in dictionary.txt
# swedish_word english_word article word_type
# hund dog ett noun
# katt cat ett noun
# sova sleep ett verb
with open('dictionary.txt', 'r') as f:
header = f.readline().strip().split("\t")
instances = [Word(**dict(zip(header, line.strip().split("\t"))))
for line in f.read().split("\n")
if line.strip()]
# for line in f.read().split("\n"):
# data = dict(zip(header, line.strip().split("\t")))
# w = Word(**data)
You can get instance properties for a given swedish_word like this
def print_swedish_word_properties(swedish_word):
for instance in instances:
if instance() == swedish_word:
print "Properties for Swedish Word:", swedish_word
instance.print_attrs()
print_swedish_word_properties("hund")
to have output like this
Properties for Swedish Word: hund
{'ARTICLE': 'ett',
'ENGLISH_WORD': 'dog',
'SWEDISH_WORD': 'hund',
'WORD_TYPE': 'noun'}
or you can use any other class methods to search instances on various attributes