Simple way to remove duplicate whitespaces and remove all \n efficiently - python

I have a file called test.txt that contains HTML with a lot of duplicate spaces. I want to remove all the unnecessary whitespace to reduce the size of the file's contents. How can I remove the duplicate spaces and put the entire string on one line?
test.txt
<center>
<b class="test" >My name
is
fred</ b> <center>
What I want to print
<center><b class="test">My name is fred</b><center>
What gets printed
<center><b class="test" >Mynameisfred</b> <center>
program.py
# Decide whether the character `curr` should be dropped, given its neighbours.
# Returns True (drop) for any newline and for a space that has a space on
# either side; returns False otherwise.
# NOTE: a space is dropped when EITHER neighbour is a space, so every space in
# a run of two or more is dropped (none survives), and newlines are dropped
# without a replacement space -- this is why words end up glued together.
def is_white_space(before, curr, after):
# remove duplicate spaces
if (curr == " " and (before == " " or after == " ")):
return True
# Remove all \n
elif (curr == "\n"):
return True
return False
# Read the whole file into memory, then rebuild it one character at a time,
# keeping only the characters that is_white_space() does not reject.
f = open('test.txt', 'r')
contents = f.read()
f.close()
new = "";
i = 0
while (i < len(contents)):
# The i != 0 and i != len - 1 guards keep contents[i - 1] and contents[i + 1]
# in range, but they also mean the first and last characters are never copied.
if (i != 0 and
i != (len(contents) - 1) and
not is_white_space(contents[i - 1], contents[i], contents[i + 1])):
new += contents[i]
i += 1
print(new)

This will leave a space between digits or letters.
from string import ascii_letters, digits
def main():
    """Print the contents of ``test.txt`` on one line with minimal whitespace.

    The text is split on every run of whitespace (spaces, tabs, newlines) and
    the pieces are glued back together. A single space is kept only where
    dropping it would fuse two words or numbers, i.e. where a piece ending in
    a letter or digit is followed by a piece starting with one.
    """
    with open('test.txt', 'r') as f:
        parts = f.read().split()
    pieces = []
    for current, following in zip(parts, parts[1:]):
        pieces.append(current)
        # str.isalnum() instead of an ASCII-only lookup table, so accented or
        # other non-ASCII words are not glued together.
        if current[-1].isalnum() and following[0].isalnum():
            pieces.append(' ')
    # The last piece has no successor; the slice is empty for an empty file.
    pieces.extend(parts[-1:])
    print(''.join(pieces))


if __name__ == '__main__':
    main()

Related

obfuscation of a text file using python - by reversing the words and inserting a specific number of random characters between them

Beginner coding problem: I am supposed to write code that reverses the contents of a file and then inserts a number of random characters based on a strength the user chooses. It then creates a new file containing the obfuscated text.
For example, if the user chooses strength = 2, it will insert 2 random characters between each letter in the text file: The cat sits ---> sgyt6gilns t7faxdc e3dh1kT
Right now my program inserts too many characters in between and I can't figure out why.
This is what it's doing:
input: CAT
Output of strength = 1: TeAEADQoC
import string
import random
def getRandomChar():
    """Return one character picked at random from the ASCII letters and digits."""
    return random.choice(string.ascii_letters + string.digits)
# Build a string of random characters, one getRandomChar() call per loop pass.
# NOTE: the loop bound is len(EncrypStrength), so the argument must be
# something with a length (the caller in this listing passes the reversed
# line); an int would raise TypeError here.
def randomString(EncrypStrength):
count = 0
result = ''
while count < len(EncrypStrength):
result += getRandomChar()
count += 1
return result
def ReverseString(OrigFile):
    """Return OrigFile with its elements in reverse order."""
    backwards = OrigFile[::-1]
    return backwards
# Reverse `line` and insert random filler between consecutive characters.
# NOTE: the first statement rebinds EncrypStrength to the reversed line, so
# the numeric strength passed in by the caller is discarded; from here on
# len(EncrypStrength) is the line length, not the requested strength.
def LineEncrypt(line, EncrypStrength):
EncrypStrength = ReverseString(line)
index = 0
# Start with the first character of the reversed line (fails on an empty line).
newline = EncrypStrength[index]
index += 1
while index < len(EncrypStrength):
# randomString() receives the reversed line here, not a number.
newline += randomString(EncrypStrength)
newline += EncrypStrength[index]
index += 1
return newline
def main():
    """Prompt for file names and a strength, then write the obfuscated copy.

    Each line of the original file is passed to LineEncrypt without its
    line terminator and written to the output file followed by a newline.
    """
    OrigFile = input('Original File Name:')
    EncryptedFile = input("obfuscated File Name:")
    EncrypStrength = int(input('Enter the Encryption Strength:'))
    # ``with`` closes both files even if reading, obfuscating or writing fails.
    with open(OrigFile, 'r') as Orig, open(EncryptedFile, 'w') as Encrypted:
        for line in Orig:
            # Drop the trailing newline first; otherwise it would be reversed
            # to the front of the line and padded with random characters.
            text = line.rstrip('\n')
            # Blank lines stay blank: LineEncrypt indexes the first character.
            encryptLine = LineEncrypt(text, EncrypStrength) if text else ''
            Encrypted.write(encryptLine + "\n")


if __name__ == "__main__":
    main()
In the LineEncrypt method you are using EncrypStrength incorrectly: you overwrite the number of characters to insert (EncrypStrength) with the reversed line.
def LineEncrypt(line, EncrypStrength):
    """Return `line` reversed, with random filler between its characters.

    EncrypStrength is passed unchanged to randomString() for every gap
    between two adjacent characters of the reversed line. An empty line
    yields an empty string.
    """
    reversedString = ReverseString(line)
    if not reversedString:
        # Nothing to interleave; indexing [0] below would raise IndexError.
        return ''
    pieces = [reversedString[0]]
    for char in reversedString[1:]:
        pieces.append(randomString(EncrypStrength))
        pieces.append(char)
    # The result must be returned: the caller concatenates it with "\n".
    return ''.join(pieces)
You are confusing EncrypStrength and overriding it as Ritesh mentioned.
Here is the full corrected code, I hope it will work as you expected.
import string
import random
def getRandomChar():
    """Return one character picked at random from the ASCII letters and digits."""
    return random.choice(string.ascii_letters + string.digits)
def randomString(EncrypStrength):
    """Return a string of EncrypStrength random characters.

    EncrypStrength is an integer count; each character comes from a separate
    getRandomChar() call. A count of zero or less gives an empty string.
    """
    # join() builds the result in one pass instead of repeated string +=.
    return ''.join(getRandomChar() for _ in range(EncrypStrength))
def ReverseString(OrigFile):
    """Return OrigFile with its elements in reverse order."""
    backwards = OrigFile[::-1]
    return backwards
def LineEncrypt(line, EncrypStrength):
    """Return `line` reversed, with random filler between its characters.

    EncrypStrength is passed unchanged to randomString() for every gap
    between two adjacent characters of the reversed line. An empty line
    yields an empty string.
    """
    RevStr = ReverseString(line)
    if not RevStr:
        # Nothing to interleave; indexing [0] below would raise IndexError.
        return ''
    pieces = [RevStr[0]]
    for char in RevStr[1:]:
        pieces.append(randomString(EncrypStrength))
        pieces.append(char)
    return ''.join(pieces)
def main():
    """Prompt for file names and a strength, then write the obfuscated copy.

    Each line of the original file is passed to LineEncrypt without its
    line terminator and written to the output file followed by a newline.
    """
    OrigFile = input('Original File Name:')
    EncryptedFile = input("obfuscated File Name:")
    EncrypStrength = int(input('Enter the Encryption Strength:'))
    # ``with`` closes both files even if reading, obfuscating or writing fails.
    with open(OrigFile, 'r') as Orig, open(EncryptedFile, 'w') as Encrypted:
        for line in Orig:
            # Drop the trailing newline first; otherwise it would be reversed
            # to the front of the line and padded with random characters.
            text = line.rstrip('\n')
            # Blank lines stay blank: LineEncrypt indexes the first character.
            encryptLine = LineEncrypt(text, EncrypStrength) if text else ''
            Encrypted.write(encryptLine + "\n")


if __name__ == "__main__":
    main()

Finding honorific text and then capital letters in Python

# Unfinished attempt to spot "Mr. " followed by capital letters, using one
# flag variable per expected character.
file = open('AllCongress.txt', "r")
lines = file.readlines()
file.close()
a = ""
b = ""
c = ""
d = ""
e = ""
f = ""
g = ""
y = ""
z = ""
# NOTE: `lines` comes from readlines(), so each x is a whole line of the file
# (including its newline), not a single character.
for x in lines:
if ((x == "M") or (a == "M")):
# NOTE: `==` compares and discards the result; a is never assigned here.
# The same applies to b, c, d, e, f and g below.
a == "M"
if((x == "r") or (b == "r")):
b == "r"
if((x == ".") or (c == ".")):
c == "."
if((x == " ") or (d == " ")):
d == " "
# NOTE: x.isupper() returns a bool, so x == x.isupper() compares a str with a
# bool. The next line is also missing a closing parenthesis.
if((x == x.isupper()) or (e == "A"):
e == "A"
if((x == x.isupper()) and ((e == "A") and (f == "A"))):
f == "A"
if((x == x.isupper()) or (g == g.isupper())):
g == "A"
I'm trying to divide sections of a .txt file into two separate files based on whether it contains a male or female speaker. Speakers in this file type have their last name in all caps after the honorific. So the male honorific format for speakers in this file is "Mr. XYZ" with XYZ being any 3+ capital letters (3 capital letters in a row is enough to detect anybody in the file as a speaker). The female format is similar, just either "Ms. XYZ" or "Mrs. XYZ".
I want to get all of the text after a speaker name like that is listed and then sort them into separate male and female text file, until the next speaker speaks where I have to determine gender again to sort.
Unfortunately though, I'm new to Python and I'm unable to figure out a way that I can check for both "Mr. " or "Mrs. " and then at least 3 capital letters in a row afterwards. I really just need a way to detect this and I can probably figure out the rest. The code above is this really messy and unfinished way I tried to capture the "Mr. XYZ" part of text.
here is some code (if you have questions -> ask):
# Honorifics that introduce a speaker; "Ms." is included alongside "Mr."/"Mrs.".
HONORIFICS = ('Mr.', 'Mrs.', 'Ms.')


def find_speakers(words):
    """Yield "<honorific> <NAME>" for each honorific followed by an upper-case word.

    words is a sequence of tokens. Tokens are examined in adjacent pairs, so
    an honorific that is the very last token is skipped instead of indexing
    past the end of the sequence.
    """
    for word, following in zip(words, words[1:]):
        if word in HONORIFICS and following.isupper():
            yield word + ' ' + following


if __name__ == '__main__':
    with open('yourfile.txt') as file:
        # split() without an argument also splits on newlines and tabs, so an
        # honorific at the start of a line is still a token of its own.
        words = file.read().split()
    for speaker in find_speakers(words):
        print(speaker)
I suggest you use a with statement for opening files. Basically, you loop over each word and check whether it equals what you need.
Use String functions like startswith() or endswith()
And don't close the file before reading lines
# Read the whole text and split it into space-separated words.
# read() is used because readlines() returns a list, which has no split().
with open('AllCongress.txt', "r") as file:
    lines = file.read().split(' ')
for x in lines:
    if x.startswith("Mr."):
        pass  # code here: handle a "Mr." speaker
    elif x.startswith(("Ms.", "Mrs.")):
        pass  # code here: handle a "Ms."/"Mrs." speaker

Python for loop - stripping line as you go

I am trying to strip a line of code so that only the comment at the end is saved. Because # signs can be included within "" marks, I am trying to cycle through the line catching pairs of " marks so that any # marks within "" marks are ignored. When I use a code visualiser on my code below, after the second for loop it seems to go back to processing s as if it had just stripped the first " mark. I can't see what I'm doing wrong here, because the print statement I have included on line 19 shows that s has been stripped to after the second ", but when the code returns to the top, it starts cycling again from after the first ". Any idea what I am doing wrong here?
# Attempt to cut s down to its trailing comment by skipping quoted sections.
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
quoteCount = 0
# NOTE: `for char in s` iterates over the string object that s referred to
# when the loop started; rebinding s inside the loop does not change what is
# being iterated, so the loop keeps walking the original characters.
for char in s:
if quoteCount%2 == 0:
if char == '#':
s = s[s.index('#'):]
break
if char == '"':
quoteCount = quoteCount + 1
s = s[s.index('"'):]
s = s.lstrip('"')
# The same applies here: this loop walks the value s has at this point.
for char in s:
if char == '"':
quoteCount = quoteCount + 1
s = s[s.index('"'):]
s = s.lstrip('"')
print(s)
break
print(s)
If I understand your question correctly you only want to keep the last comment (#lots of hash(#) symbols here).
To do this you don't need the nested for loop.
def extract_comment(line):
    """Return the trailing comment of `line`, starting at its '#'.

    A '#' inside a quoted section (single or double quotes) is ignored.
    Returns '' when the line has no '#' outside quotes. Backslash-escaped
    quotes are not interpreted.
    """
    open_quote = None  # the quote character we are currently inside, if any
    for pos, char in enumerate(line):
        if open_quote is not None:
            if char == open_quote:
                open_quote = None
        elif char in '"\'':
            open_quote = char
        elif char == '#':
            return line[pos:]
    return ''


s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
s = extract_comment(s)
print(s)
Easier to remove the quoted strings with a regular expression:
import re
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
# Delete every double-quoted section so any '#' left over is outside quotes.
pattern = r'"[^"]*"'
s = re.sub(pattern, '', s)
# Everything from the first remaining '#' is the comment; use '' when there
# is none (str.index would raise ValueError in that case).
start = s.find('#')
comment = s[start:] if start >= 0 else ''
print(comment)
Output:
#lots of hash(#) symbols here
Your code is overly complicated so I suggest you use an alternative method to finding the comment like the already mentioned regex one or the one I came up with.
s = '("8# " + str" #9 " + line) #lots of hash(#) symbols here'
# Keep only what follows the final double quote; rfind() returns -1 when there
# is no quote, so the slice then starts at 0 and keeps everything.
after_quotes = s[s.rfind('"') + 1:]
# The first '#' in that tail is taken as the start of the comment; with no
# '#' at all the result is the empty string.
hash_at = after_quotes.find('#')
s = after_quotes[hash_at:] if hash_at >= 0 else ''
print(s)

Stop replacement by pattern?

Say my file look like this:
some lines
tom
some lines
beginword a b
some lines
endword
jim
some lines
beginword x y
some lines
endword
...
Want to be:
some lines
tom
some lines
beginword ZZ b
some lines
endword
jim
some lines
beginword x y
some lines
endword
So this is my python code:
# Copy "file" to "file_updated", rewriting "beginword" lines once a line
# containing "tom\n" has been seen.
input = open("file", "r")
output = open("file_updated", "w")
dummy = ""
item = []
for line in input:
dummy += line
if line.find("tom" + "\n") != -1:
# NOTE: this loop iterates the same file object, so it consumes every
# remaining line; nothing stops it at "endword", which is why "beginword"
# lines after "jim" are rewritten as well.
for line in input:
if line.find("beginword") != -1:
item = line.split()
# Keep the first and last fields and put " w " between them.
dummy += item[0] + " w " + item[-1] + "\n"
else:
dummy += line
output.write(dummy)
input.close()
output.close()
It replaces all lines containing "beginword", including the ones that belong to "jim". How can I stop the replacement at the "endword" that belongs to "tom"?
Use break statement
def replace_in_section(lines, owner="tom", replacement="w"):
    """Return `lines` with "beginword" lines rewritten inside owner's section.

    A section starts after a line that consists of `owner` and ends at the
    next line that consists of "endword". Inside a section, a line containing
    "beginword" is replaced by its first field, `replacement` and its last
    field; every other line is copied unchanged.
    """
    result = []
    in_section = False
    for line in lines:
        stripped = line.strip()
        if not in_section:
            result.append(line)
            # Compare the whole line so that e.g. "atom" does not open a section.
            in_section = stripped == owner
        elif stripped == "endword":
            # End of the owner's block: later "beginword" lines are left alone.
            # Comparing the stripped text also matches a final "endword" that
            # has no trailing newline.
            result.append(line)
            in_section = False
        elif "beginword" in line:
            fields = line.split()
            result.append(fields[0] + " " + replacement + " " + fields[-1] + "\n")
        else:
            result.append(line)
    return result


if __name__ == "__main__":
    # The with-statement closes both files even if processing fails, and the
    # builtin input() is no longer shadowed by a file handle.
    with open("file", "r") as source, open("file_updated", "w") as target:
        target.writelines(replace_in_section(source))
Also, have a look at these:
reading and writing files
regular expressions

.split(",") separating every character of a string

At some point in the program I take the user's text input and separate the text at its commas, and then I ",".join it again when writing it to a txt file. The idea is to have a list with all the comma-separated information.
The problem is that, apparently, when I ",".join it, it separates every single character with commas. If I've got the string info1,info2, splitting it gives info1 | info2, but joining it back again gives i,n,f,o,1,,,i,n,f,o,2, which is highly inconvenient, since the program later reads the text back from the txt file to show it to the user. Can anyone help me with that?
# Excerpt: append the new book's categories to category.txt, update the
# per-category counts in lastReadCategories.txt, then rewrite category.txt.
# Names such as connectedUser, BookCategory, lastReadCategoriesList, isThere,
# readList, readingList, s and itemgetter are defined outside this excerpt.
categories = open('c:/digitalLibrary/' + connectedUser + '/category.txt', 'a')
categories.write(BookCategory + '\n')
categories.close()
categories = open('c:/digitalLibrary/' + connectedUser + '/category.txt', 'r')
categoryList = categories.readlines()
categories.close()
# Each stored entry has the form "<category>,<count>".
for category in BookCategory.split(','):
for readCategory in lastReadCategoriesList:
if readCategory.split(',')[0] == category.strip():
count = int(readCategory.split(',')[1])
count += 1
i = lastReadCategoriesList.index(readCategory)
lastReadCategoriesList[i] = category.strip() + "," + str(count).strip()
isThere = True
if not isThere:
lastReadCategoriesList.append(category.strip() + ",1")
isThere = False
lastReadCategories = open('c:/digitalLibrary/' + connectedUser + '/lastReadCategories.txt', 'w')
for category in lastReadCategoriesList:
if category.split(',')[0] != "" and category != "":
lastReadCategories.write(category + '\n')
lastReadCategories.close()
global finalList
finalList.append({"Title":BookTitle + '\n', "Author":AuthorName + '\n', "Borrowed":IsBorrowed + '\n', "Read":readList[len(readList)-1], "BeingRead":readingList[len(readingList)-1], "Category":BookCategory + '\n', "Collection":BookCollection + '\n', "Comments":BookComments + '\n'})
finalList = sorted(finalList, key=itemgetter('Title'))
for i in range(len(finalList)):
categoryList[i] = finalList[i]["Category"]
toAppend = (str(i + 1) + ".").ljust(7) + finalList[i]['Title'].strip()
s.append(toAppend)
categories = open('c:/digitalLibrary/' + connectedUser + '/category.txt', 'w')
for i in range(len(categoryList)):
# NOTE: categoryList[i] is a single string here (a line from readlines() or
# the "Category" value assigned above), so ",".join() puts a comma between
# every character of it.
categories.write(",".join(categoryList[i]))
categories.close()
You should pass ''.join() a list, you are passing in a single string instead.
Strings are sequences too, so ''.join() treats every character as a separate element instead:
>>> ','.join('Hello world')
'H,e,l,l,o, ,w,o,r,l,d'
>>> ','.join(['Hello', 'world'])
'Hello,world'

Categories

Resources