I have files where bash string variables are gradually appended:
URI += "path \
path \
path \
"
<some other code>
#URI += "path"
URI += "path \
path"
As you may notice, there are different ways of appending, some of them spanning several lines. There is other code in those files as well.
Now I tried to write a function which gets the content of the variables (everything between the quotes):
# Asker's first attempt: collect the lines that belong to a quoted
# assignment of `var` in the file at `filepath`, and return them as a list.
# NOTE(review): the snippet is shown without indentation, so it is not
# runnable exactly as posted.
def grepVar(filepath, var):
# NOTE: the name `list` shadows the built-in list type inside this function.
list = []
with open(filepath, "r") as file:
for num, line in enumerate(file, 1):
if var in line:
# NOTE: `is` compares object identity, not value; `== 2` is the reliable test.
if line.count('"') is 2:
list.append(line)
# until here it works for "URIs" over 1 line
else:
# NOTE: changing `num` does not advance the file iterator; `line` is unchanged.
num = num + 1
# NOTE: nothing below reads a new line, so this condition never changes.
while(line.count('"') is 0):
list.append(line)
num = num + 1
return list
print grepVar(path, "URI")
So in the else branch I try to advance the loop manually and append all lines until another quote appears (the while loop). I am not sure whether I can build on this idea or have to discard it completely. In that case, could you please give me hints on how to solve my problem? I am not sure I described it well, since it is rather specific.
Since line is supplied by the enclosing for num, line in enumerate(file, 1): loop, you cannot move on to the next line with a while (line...) inside that loop: line never changes there.
A common way to solve this problem is to save state between lines. Your function could become (I removed the num management because I could not understand the requirement):
def grepVar(filepath, var):
    """Return the lines that make up every quoted assignment mentioning *var*.

    A line containing *var* is always collected.  If that line leaves a
    double-quoted string open (it holds an odd number of ``"``), the
    following lines are collected too, up to and including the line that
    closes the string.

    Args:
        filepath: Path of the text file to scan.
        var: Substring that marks a line of interest (e.g. ``"URI"``).

    Returns:
        The collected lines, in file order, with their line endings intact.
    """
    lst = []
    inquote = False
    with open(filepath, "r") as fil:
        for line in fil:
            if not inquote and var not in line:
                continue
            lst.append(line)
            # An odd number of quotes flips the open/closed state; an even
            # number (0, 2, ...) leaves it as it was.  This keeps a line such
            # as `echo $URI` from starting a runaway multi-line capture.
            if line.count('"') % 2 == 1:
                inquote = not inquote
    return lst
You should also avoid using standard Python names such as list or file for your own variables, because they hide the standard meanings.
Related
import os
def encrypt(filename, extension):
    """Obfuscate ``<filename>.<extension>`` into the extension-less *filename*.

    The output consists of the extension, a ``'Q'`` separator, and the
    source content with every ``'0'`` character replaced by ``'#!#'``.
    The source file is removed once the output has been written.

    Args:
        filename: Path of the file without its extension.
        extension: Extension of the source file, without the leading dot.

    Raises:
        OSError: If the source cannot be read or removed, or the output
            cannot be written.
    """
    source_path = filename + '.' + extension
    # Latin-1 maps every byte to one character, and newline='' switches off
    # line-ending translation, so arbitrary (binary) content survives intact.
    with open(source_path, 'r', encoding="Latin-1", newline='') as src:
        content = src.read()
    # NOTE: a literal '#!#' already present in the content cannot be told
    # apart from an encoded '0' afterwards.
    encoded = extension + 'Q' + content.replace('0', '#!#')
    with open(filename, 'w', encoding="Latin-1", newline='') as dst:
        dst.write(encoded)
    # Delete the source only after the output exists, and without a shell.
    os.remove(source_path)
def decrypt(filename):
    """Restore a file written in the ``<extension>Q<content>`` format.

    Everything before the first ``'Q'`` is taken as the original extension.
    In the remainder every ``'#!#'`` is turned back into ``'0'``.  The
    result is written to ``<filename>.<extension>`` and *filename* itself
    is removed.  The file name and the recovered extension are printed.

    Args:
        filename: Path of the obfuscated, extension-less file.

    Raises:
        ValueError: If the file contains no ``'Q'`` separator.
        OSError: If a file cannot be read, written or removed.
    """
    # Latin-1 plus newline='' reads the bytes back unchanged as characters.
    with open(filename, 'r', encoding="Latin-1", newline='') as src:
        dat = src.read()
    exten, separator, body = dat.partition('Q')
    if not separator:
        raise ValueError("no 'Q' separator found in " + repr(filename))
    # str.replace scans left to right without overlapping matches.
    fin = body.replace('#!#', '0')
    print(filename)
    print(exten)
    with open(filename + '.' + exten, 'w', encoding="Latin-1", newline='') as dst:
        dst.write(fin)
    # Delete the input only after the output exists, and without a shell.
    os.remove(filename)
decrypt('eee')
#encrypt('eee','png')
#print(open('test.docx','r',encoding="Latin-1").readlines())
#print(open('eee.png','r',encoding='latin-1').read())
I just want it to work the way I expect, and I have no idea what breaks it. When I open the files with Notepad they look the same, so I don't know what is wrong at all, which is weird. Sorry that my code is messy; that is how all my code looks.
What do you mean by it 'breaks'? What I'm seeing in your code is:
When you run open(filename,...), you assume file is correctly holding the file object. You have no error checking in either encrypt() or decrypt(). Read the documentation for open() and see what it returns and which errors it raises. You should be able to do something along the lines of:
try:
    file = open(filename, 'r', encoding="Latin-1")
except OSError as err:
    # open() reports a missing file, wrong permissions, etc. as OSError
    # (or a subclass); report `err` or recover here.
    ...
# start file operations here.
File handling can and will go wrong at some point. You can give the program any file that you can specify, but you need to make sure that operation will run successfully. Common file errors are it not existing, being already open, or the permissions being wrong
In both encrypt() and decrypt() you repeat this block:
os.system('del '+filename+'.'+extension)
file = open(filename,'a')
file.close()
file = open(filename,'w',encoding="Latin-1")
file.write(fin)
file.close()
where you tell the system to DELETE your file and then re-open it. I suspect that is a part of why you have issues with the file. I'd also consider removing the second close() & open() calls to see if you can change the mode without manipulating the file pointer. If this block of code has to be in both parts, separate it into its own function. And run error handling on Everything.
I have a .txt file of amino acids separated by ">node" like this:
Filename.txt :
>NODE_1
MSETLVLTRPDDWHVHLRDGAALQSVVPYTARQFARAIAMPNLKPPITTAEQAQAYRERI
KFFLGTDSAPHASVMKENSVCGAGCFTALSALELYAEAFEAAGALDKLEAFASFHGADFY
GLPRNTTQVTLRKTEWTLPESVPFGEAAQLKPLRGGEALRWKLD*
>NODE_2
MSTWHKVQGRPKAQARRPGRKSKDDFVTRVEHDAKNDALLQLVRAEWAMLRSDIATFRGD
MVERFGKVEGEITGIKGQIDGLKGEMQGVKGEVEGLRGSLTTTQWVVGTAMALLAVVTQV
PSIISAYRFPPAGSSAFPAPGSLPTVPGSPASAASAP*
I want to separate this file into two (or as many as there are nodes) files;
Filename1.txt :
>NODE
MSETLVLTRPDDWHVHLRDGAALQSVVPYTARQFARAIAMPNLKPPITTAEQAQAYRERI
KFFLGTDSAPHASVMKENSVCGAGCFTALSALELYAEAFEAAGALDKLEAFASFHGADFY
GLPRNTTQVTLRKTEWTLPESVPFGEAAQLKPLRGGEALRWKLD*
Filename2.txt :
>NODE
MSTWHKVQGRPKAQARRPGRKSKDDFVTRVEHDAKNDALLQLVRAEWAMLRSDIATFRGD
MVERFGKVEGEITGIKGQIDGLKGEMQGVKGEVEGLRGSLTTTQWVVGTAMALLAVVTQV
PSIISAYRFPPAGSSAFPAPGSLPTVPGSPASAASAP*
With a number after the filename
This code works, however it deletes the ">NODE" line and does not create a file for the last node (the one without a '>' afterwards).
with open('FilePathway') as fo:
op = ''
start = 0
cntr = 1
for x in fo.read().split("\n"):
if x.startswith('>'):
if start == 1:
with open (str(cntr) + '.fasta','w') as opf:
opf.write(op)
opf.close()
op = ''
cntr += 1
else:
start = 1
else:
if op == '':
op = x
else:
op = op + '\n' + x
fo.close()
I can't seem to find the mistake. I would be thankful if you could point it out to me.
Thank you for your help!
Hi again! Thank you for all the comments. With your help, I managed to get it to work perfectly. For anyone with similar problems, this is my final code:
import os
import glob
folder_path = 'FilePathway'
for filename in glob.glob(os.path.join(folder_path, '*.fasta')):
with open(filename) as fo:
for line in fo.readlines():
if line.startswith('>'):
original = line
content = [original]
fileno = 1
filename = filename
y = filename.replace(".fasta","_")
def writefasta():
global content, fileno
if len(content) > 1:
with open(f'{y}{fileno}.fasta', 'w') as fout:
fout.write(''.join(content))
content = [line]
fileno += 1
with open('FilePathway') as fin:
for line in fin:
if line.startswith('>NODE'):
writefasta()
else:
content.append(line)
writefasta()
You could do it like this:
def writefasta(d):
    """Write the pending record to ``Filename<fileno>.fasta`` and start a new one.

    Nothing happens while the record holds only its header line, so the
    first header seen in the input neither produces an empty file nor
    uses up a file number.

    Args:
        d: Mutable state with the keys ``'content'`` (list of lines, the
            header first) and ``'fileno'`` (number of the next output file).
            Both are updated in place after a file has been written.
    """
    if len(d['content']) > 1:
        with open(f'Filename{d["fileno"]}.fasta', 'w') as fout:
            fout.write(''.join(d['content']))
        # Reset only after a successful write so numbering stays contiguous.
        d['content'] = ['>NODE\n']
        d['fileno'] += 1
with open('test.fasta') as fin:
D = {'content': ['>NODE\n'], 'fileno': 1}
for line in fin:
if line.startswith('>NODE'):
writefasta(D)
else:
D['content'].append(line)
writefasta(D)
This would be a better way. It writes only on odd iterations, so the ">NODE" lines are skipped and files are created only for the real content.
with open('filename.txt') as fo:
cntr=1
for i,content in enumerate(fo.read().split("\n")):
if i%2 == 1:
with open (str(cntr) + '.txt','w') as opf:
opf.write(content)
cntr += 1
By the way, since you are using a context manager, you don't need to close the file.
Context managers allow you to allocate and release resources precisely
when you want to. It opens the file, writes some data to it and then
closes it.
Please check: https://book.pythontips.com/en/latest/context_managers.html
with open('FileName') as fo:
cntr = 1
for line in fo.readlines():
with open (f'{str(cntr)}.fasta','w') as opf:
opf.write(line)
opf.close()
op = ''
cntr += 1
fo.close()
I am trying to extract IPv4 addresses from a text file and save them as a list to a new file. However, I cannot use regex to parse the file; instead, I have to check the characters individually. I am not really sure where to start with that, since everything I find seems to have import re as the first line.
So far this is what I have,
#Opens and prints wireShark txt file
fileObject = open("wireShark.txt", "r")
data = fileObject.read()
print(data)
#Save IP adresses to new file
with open('wireShark.txt') as fin, open('IPAdressess.txt', 'wt') as fout:
list(fout.write(line) for line in fin if line.rstrip())
#Opens and prints IPAdressess txt file
fileObject = open("IPAdressess.txt", "r")
data = fileObject.read()
print(data)
#Close Files
fin.close()
fout.close()
So I open the file, and I have created the file that I will put the extracted IPs in; I just don't know how to pull them out without using regex.
Thanks for the help.
Here is a possible solution.
The function find_first_digit positions the index at the next digit in the text, if any, and returns True; otherwise it returns False.
The functions get_dot and get_num read a dot/number, leave the index at the position just after it, and return it as a str. If one of those functions fails to get the dot/number, it raises a MissMatch exception.
In the main loop, find a digit, save the index and then try to get an ip.
On success, write it to the output file.
If any of the called functions raises a MissMatch exception, set the current index to the saved index plus one and start over.
class MissMatch(Exception):
    """Signal that the text at the read position is not the expected token."""


INPUT_FILE_NAME = 'text'
OUTPUT_FILE_NAME = 'ip_list'

# Text stream shared by the helpers below; main() binds it to the input file.
input_file = None


def _is_ascii_digit(c):
    """Return True if *c* is exactly one of the characters 0-9."""
    return len(c) == 1 and c in '0123456789'


def find_first_digit():
    """Advance ``input_file`` to the next ASCII digit.

    Returns:
        True with the stream positioned on the digit (not yet consumed),
        or False if the end of the input is reached first.
    """
    while True:
        c = input_file.read(1)
        if not c:  # EOF found!
            return False
        if _is_ascii_digit(c):
            # Step back over the single digit that was just read.
            # NOTE(review): this subtracts from the value of tell(); confirm
            # that is acceptable for the encoding of the real input.
            input_file.seek(input_file.tell() - 1)
            return True


def get_num():
    """Read a number of one to three digits from ``input_file``.

    A leading ``0`` is returned on its own.  The character after the
    number is left unread.  No upper bound (such as 255) is enforced.

    Returns:
        The digits read, as a str.

    Raises:
        MissMatch: If the next character is not an ASCII digit.
    """
    num = input_file.read(1)  # 1st digit
    if not _is_ascii_digit(num):
        raise MissMatch
    if num != '0':
        for _ in range(2):  # 2nd and 3rd digits
            # Remember where the look-ahead character starts, so it can be
            # put back exactly, whatever its encoded size and even at EOF.
            before = input_file.tell()
            c = input_file.read(1)
            if not _is_ascii_digit(c):
                input_file.seek(before)
                break
            num += c
    return num


def get_dot():
    """Consume one character and return ``'.'`` if that is what it was.

    Raises:
        MissMatch: If the character read is not a dot.
    """
    if input_file.read(1) == '.':
        return '.'
    raise MissMatch


def main():
    """Copy every dotted quad in INPUT_FILE_NAME to OUTPUT_FILE_NAME, one per line."""
    global input_file
    with open(INPUT_FILE_NAME) as input_file, open(OUTPUT_FILE_NAME, 'w') as output_file:
        while find_first_digit():
            saved_position = input_file.tell()
            try:
                ip = (get_num() + get_dot()
                      + get_num() + get_dot()
                      + get_num() + get_dot()
                      + get_num())
            except MissMatch:
                # Restart the search one character after the failed start.
                input_file.seek(saved_position)
                input_file.read(1)
            else:
                output_file.write(ip + '\n')


if __name__ == '__main__':
    main()
#
# Obtain user input for file name, and open it
#
inFile = open(input("Enter file name: "), "r")
#
# Process data and address possible errors
#
countDinner = 0
countLodging = 0
countConference = 0
valueDinner = 0
valueLodging = 0
valueConference = 0
done = False
while not done :
line = inFile.readline()
try :
s = line
serviceAmount = ';'.join(s.split(';')[1:-1]) #Removes date and name regardless of format
serviceAmount.split(";")
s.lower()
if "dinner" in s :
countDinner = countDinner + 1
valueDinner = valueDinner + int(filter(str.isdigit, s))
print("Dinners: ", countDinner, "Value of Dinner sales: ", valueDinner)
elif "lodging" in s :
countLodging = countLodging + 1
valueLodging = valueLodging + int(filter(str.isdigit, s))
print("Lodging: ", countLodging, "Value of Lodging sales: ", valueLodging)
elif "conference" in s :
countConference = countConference + 1
valueConference = valueConference + int(filter(str.isdigit, s))
print("Conferences: ", countConference, "Value of Conference sales: ", valueConference)
elif line == "" :
done = True
else :
print("Invalid file format.")
except FileNotFoundError :
print("Unable to find file.")
finally :
done = True
inFile.close()
Returns "Invalid file format" even when the document is set up specifically for this code. I'm not getting a syntax error, so I'm not sure whats wrong.
The document contains the text:
John;Lodging;123;050617
Tyler;Conference;123;081497
Taylor;Dinner;453;041798
There are a lot of things you aren't doing quite right here. I tried to not only fix the issue you posted about, but also write some code that should be more clear and easier to use. I left comments to explain things.
# Don't open the file here, just get the file name. We will open it later.
fname = input("Enter file name: ")

# Dicts keep the tallies organized; one variable per counter makes the code messy.
counts = {"Dinner": 0,
          "Lodging": 0,
          "Conference": 0}
values = {"Dinner": 0,
          "Lodging": 0,
          "Conference": 0}

# Let's try to open the file.
try:
    # Use "with" so the file is closed automatically when we are done reading it.
    with open(fname, 'r') as inFile:
        # Enumerate from 1 so an error message names the line a text editor shows.
        for linenum, line in enumerate(inFile, 1):
            if not line.strip():
                continue  # a blank line is not a record
            # Make it all lower case, then split and drop the first and last fields.
            line = line.lower().split(';')[1:-1]
            print(line)
            try:
                amount = int(line[1])
            except (IndexError, ValueError):
                # Too few fields, or the amount is not a whole number.
                print("Invalid file format on line {}".format(linenum))
                continue
            if "dinner" in line:
                counts["Dinner"] += 1  # x += 1 is the same as x = x + 1, but cleaner
                values["Dinner"] += amount
                print("Dinners: {} Value of Dinner sales: {}".format(counts["Dinner"], values["Dinner"]))
            elif "lodging" in line:
                counts["Lodging"] += 1
                values["Lodging"] += amount
                print("Lodging: {} Value of Lodging sales: {}".format(counts["Lodging"], values["Lodging"]))
            elif "conference" in line:
                counts["Conference"] += 1
                values["Conference"] += amount
                print("Conference: {} Value of Conference sales: {}".format(counts["Conference"], values["Conference"]))
            else:
                print("Invalid file format on line {}".format(linenum))
except FileNotFoundError:
    print("Unable to find file.")
Here is your problem:
serviceAmount = ';'.join(s.split(';')[1:-1]) #Removes date and name regardless of format
serviceAmount.split(";")
You should do:
serviceAmount = ';'.join(s.lower().split(';')[1:-1])
You are checking against lower case strings, but not actually lower casing your input.
It is also important to note that s.lower() doesn't actually change s, it just returns a string where all the letters of s have been switched to lower case. Same thing for split (as in not changing the string its called on, not that it returns a string).
Another problem you are going to run into is getting the numbers from your strings.
int(filter(str.isdigit, s))
Won't work. You can use split again like you did earlier (or just not re-join since you only care about the first element in the comparisons).
int(serviceAmount.split(';')[1])
The last thing is the
finally:
done = True
inFile.close()
finally always runs when exiting a try, meaning that you are always done after each loop (and close the file after you read the first line).
If you remove the finally and add inFile.close() inside the elif line == "" it will close, and set done only when you've reached the end of the file.
It could be done as simple as
# Tally, per category, how many records there are and the sum of their values.
categories = {}
filename = input("Enter file name: ")
with open(filename, "r") as infile:
    # Process every record, not just the first line of the file.
    for line in infile:
        line = line.strip()
        if not line:
            continue  # skip blank lines instead of failing to unpack them
        # Each record is expected to be name;category;value;date.
        name, category, value, date = line.split(";")
        entry = categories.setdefault(category, {"count": 0, "value": 0})
        entry["count"] += 1
        entry["value"] += int(value)
At the end, you'll have a dict with categories, their count, and value, and also their names are not hard-coded.
I am processing many text files, some of which contain uuencoded data that can be a .jpg, .pdf, .zip, .xlsx, etc. I don't care about the embedded uuencoded data, so I would just like to discard these passages and keep the rest of the text. I'm struggling to come up with a method that skips just enough, but not too much.
To summarize http://en.wikipedia.org/wiki/Uuencoding each blob begins with
begin 644 filename.extension
every line after the begin 644 seems to start by the letter
M
so this might also help. Any idea how to have a function that deletes all these lines for all .txt files in a folder (directory)?
For example, the following is a .jpg uuencoding
GRAPHIC
18
g438975g32h99a01.jpg
begin 644 g438975g32h99a01.jpg
M_]C_X``02D9)1#`!`#$`8`!#``#_[0G64&AO;=&]S:&]P(#,N,``X0DE-`^T`
M`````!``8`````$``0!#`````0`!.$))300-```````$````'CA"24T$&0``
M````!````!XX0DE-`_,```````D```````````$`.$))300*```````!```X
M0DE-)Q````````H``0`````````".$))30/U``````!(`"]F9#`!`&QF9;#`&
M```````!`"]F9#`!`*&9F#`&```````!`#(````!`%H````&```````!`#4`
M```!`"T````&```````!.$))30/X``````!P``#_____________________
M________`^#`````_____________________________P/H`````/______
M______________________\#Z`````#_____________________________
M`^#``#A"24T$"```````$`````$```)````"0``````X0DE-!!X```````0`
M````.$))300:``````!M````!#``````````````)P```+`````&`&<`,P`R
M`&#`.0`Y`````0`````````````````````````!``````````````"P````
M)P`````````````````````````````````````````````X0DE-!!$`````
M``$!`#A"24T$%```````!`````(X0DE-!`P`````!SH````!````<````!D`
M``%0```#T```!QX`&``!_]C_X``02D9)1#`!`#$`2`!(``#_[#`.061O8F4`
M9(`````!_]L`A``,"`#("0#,"0D,$0L*"Q$5#PP,#Q48$Q,5$Q,8$0P,#`P,
M#!$,#`P,#`P,#`P,#`P,#`P,#`P,#`P,#`P,#`P,`0T+"PT.#1`.#A`4##X.
M%!0.##X.%!$,#`P,#!$1#`P,#`P,$0P,#`P,#`P,#`P,#`P,#`P,#`P,#`P,
M#`P,#`S_P``1"``9`'`#`2(``A$!`Q$!_]T`!``'_\0!/P```04!`0$!`0$`
M`````````P`!`#0%!#<("0H+`0`!!0$!`0$!`0`````````!``(#!`4&!P#)
M"#L0``$$`0,"!`(%!P8(!0,,,P$``A$#!"$2,05!46$3(G&!,#84D:&Q0B;,D
M%5+!8C,T<H+10P)E\K.$P]-U
MX_-&)Y2DA;25Q-3D]*6UQ=7E]59F=H:6IK;&UN;V-T=79W>'EZ>WQ]?G]Q$`
M`#(!`#0$`P0%!#<'!#4U`0`"$0,A,1($05%A<2(3!3*!D12AL4(CP5+1\#,D
M8N%R#I)#4Q5C<S3Q)086HK*#!R8UPM)$DU2C%V1%539T9>+RLX3#TW7C\T:4
MI(6TE<34Y/2EM<75Y?569G:&EJ;:VQM;F]B
I would like to be left with just
GRAPHIC
18
g438975g32h99a01.jpg
For background, see also my earlier question How to remove weird encoding from txt file
EDIT : Here is a try
start_marker = 'begin 644'
with open('fileWithBegin644.txt') as inf:
ignoreLines = False
for line in inf:
if start_marker in line:
print line,
ignoreLines = True
if not ignoreLines:
with open("strip_" + inf, "w") as f:
f.write(line.get_text().encode('utf-8'))
But I am getting the following error
File "removeUuencodingFromAll.py", line 10
with open("strip_" + inf, "w") as f:
^
IndentationError: expected an indented block
I coded up what was supposed to be a rather simple generator. Because the spec is slightly tedious (why two separate end markers on different lines?) it is rather bulky, but here goes. It should work as a validator for uuencode at the same time, but I have only tested it in very limited settings.
import re
def unuuencode(iterator, collector=None, ignore_length_errors=False):
    """
    Yield lines from iterator except when they are in an uuencode blob.

    A blob starts at a ``begin <mode> <name>`` line and ends after a line
    holding a single backquote followed by an ``end`` line.

    If collector is not None, append to it the uuencoded blobs as a list
    of a list of lines, one for each uuencoded blob.

    If ignore_length_errors is true, a data line whose length does not
    agree with its length prefix is skipped silently instead of raising.

    Raises ValueError when a blob is malformed: a bad length prefix (unless
    ignored), or a missing backquote or ``end`` line.
    """
    state = None  # one of { None, 'in_blob', 'closing', 'closed' }
    collectitem = None
    regex = re.compile(r'^begin\s+[0-7]{3,6}\s+.*?(?:\r?\n)?$')
    for line in iterator:
        if state is None:
            if regex.match(line):
                if collector is not None:
                    collectitem = [line]
                state = 'in_blob'
            else:
                yield line
            continue

        stripped = line.rstrip('\r\n')
        if state == 'in_blob' and line.startswith('`'):
            state = 'closing'
        if state == 'closing':
            if stripped != '`':
                raise ValueError('Expected "`" but got "%s"' % line)
            state = 'closed'
        elif state == 'closed':
            if stripped != 'end':
                raise ValueError('Expected "end" but got "%s"' % line)
            state = None
        else:
            # The first character encodes the number of payload bytes; an
            # empty line has no prefix at all and is always a length error.
            expect = ord(stripped[0]) - 32 if stripped else -1
            actual = len(stripped)
            nchars = max(actual - 1, 0)
            seen = nchars * 6 // 8
            # Every 3 bytes become 4 characters.  The last group may be
            # padded to a full 4 characters or left short, so accept both.
            fewest = -(-expect * 4 // 3)
            most = 4 * -(-expect // 3)
            if not (expect >= 0 and fewest <= nchars <= most):
                if not ignore_length_errors:
                    raise ValueError('Wrong prefix on line: %s '
                                     '(indicated %i, 6/8 %i, actual length %i)' % (
                                         line, expect, seen, actual))
            # Only full-length lines start with 'M'; anything else is the
            # last data line, so the terminating backquote must follow.
            if line[0:1] != 'M':
                state = 'closing'
        if collectitem is not None:
            collectitem.append(line)
        if state is None:
            if collectitem is not None:
                collector.append(collectitem)
            collectitem = None
Use it like this:
with open(file, 'r') as f:
lines = [x for x in unuuencode(f)]
or like this:
with open(file, 'r') as f:
blobs = []
lines = [x for x in unuuencode(f, collector=blobs)]
or like this:
with open(file, 'r') as f:
lines = f.read().split('\n')
# ... or whichever way you obtained your content as an array of lines
lines = [x for x in unuuencode(lines)]
or in the case of the code you seem to be using:
for fi in sys.argv[1:]:
with open(fi) as markup:
soup = BeautifulSoup(''.join(unuuencode(markup, ignore_length_errors=True)))
with open("strip_" + fi, "w") as f:
f.write(soup.get_text().encode('utf-8'))
The sample you linked to had an invalid length indicator in the second uuencoded blob, so I added an option to ignore that.