In a file I have several lines with this structure:
> Present one time: "Instance: ...Edition: ..."
> Present two times: "Instance: ...Edition: ...Instance: ...Edition: ..."
> Present n times: "Instance: ...Edition: ... [n] Instance: ...Edition: ..."
This structure can appear once per line or several times on the same line. The idea is to read the file, line by line, isolate the values represented by ... and write them in an excel file. I can do it but I'm only able to isolate the values if the structure above is present one time in the line. If the structure is present more than once on the line, I can only save the values of the first structure.
This is my code:
#READ FILE
for i in fin:
if "Instance:" in i:
instance = ((i.split('Instance:'))[1].split('Edition')[0])
worksheet.write(row, col, instance)
if "Edition:" in i:
edition = ((i.split('Edition:'))[1].split('\n')[0])
worksheet.write(row, col, edition)
row += 1
Any idea how I could solve this problem?
Note that this only works if your input ends with a newline character ('\n'), because an item is only saved when a newline (or the next 'Instance:') is reached.
If it does not, you can append one like so: s += '\n'
s = '''Instance: A Edition: Limited
Instance: B Edition: Common Instance: C Edition: 2020 Instance: D Edition: Bla
'''


def parse_items(text):
    """Return the (instance, edition) pairs found in *text*.

    The text is scanned once, character by character. An item is emitted
    when the next 'Instance:' label or a newline is reached, so the final
    item is only returned if *text* ends with a newline.

    Args:
        text: String containing zero or more
            'Instance: ... Edition: ...' groups, possibly several per line.

    Returns:
        A list of (instance, edition) tuples with surrounding whitespace
        stripped, in order of appearance.
    """
    pairs = []
    start_in = start_ed = None
    for i, char in enumerate(text):
        at_instance = text.startswith('Instance:', i)
        # Reaching the end of a data item
        if at_instance or char == '\n':
            # Compare against None explicitly: an offset is a number, and
            # relying on its truthiness would treat offset 0 as "missing".
            if start_in is not None and start_ed is not None:
                pairs.append(
                    # start_ed - 8 is where the 'Edition:' label begins.
                    (text[start_in:start_ed - 8].strip(),
                     text[start_ed:i].strip())
                )
            start_in = start_ed = None
        if at_instance:
            start_in = i + 9
        if text.startswith('Edition:', i):
            start_ed = i + 8
    return pairs


result = parse_items(s)
print(result)
[('A', 'Limited'), ('B', 'Common'), ('C', '2020'), ('D', 'Bla')]
Edit: With Version field as requested
s = '''Instance: A Edition: Limited Version: 1
Instance: B Edition: Common Version: 2 Instance: C Edition: 2020 Version: 3 Instance: D Edition: Bla Version: 4
'''


def parse_records(text, fields=('Instance', 'Edition', 'Version')):
    """Return one tuple of field values per record found in *text*.

    A record starts at the first field's label and ends at the next
    occurrence of that label or at a newline, so the last record is only
    returned if *text* ends with a newline. Records that lack any of the
    fields are dropped.

    Args:
        text: String containing zero or more records, possibly several
            per line.
        fields: Field names in the order they appear inside a record;
            each is followed by ':' in the text.

    Returns:
        A list of tuples (one value per field, whitespace stripped), in
        order of appearance.

    Raises:
        ValueError: If *fields* is empty.
    """
    if not fields:
        raise ValueError('at least one field name is required')
    labels = [name + ':' for name in fields]
    records = []
    # Offset at which each field's value starts in the current record.
    starts = [None] * len(labels)
    for i, char in enumerate(text):
        # Reaching the end of a data item
        if text.startswith(labels[0], i) or char == '\n':
            if all(pos is not None for pos in starts):
                # A value ends where the next field's label begins; the
                # last value ends at the current position.
                ends = [
                    starts[k + 1] - len(labels[k + 1])
                    for k in range(len(labels) - 1)
                ]
                ends.append(i)
                records.append(tuple(
                    text[begin:end].strip()
                    for begin, end in zip(starts, ends)
                ))
            starts = [None] * len(labels)
        for k, label in enumerate(labels):
            if text.startswith(label, i):
                starts[k] = i + len(label)
    return records


result = parse_records(s)
print(result)
Alternative solution using a regular expression. This is shorter but maybe harder to read and maintain:
import re
# Inside a character class '|' is a literal pipe, not alternation, so it is
# left out: each group matches word characters and whitespace only.
r = re.findall(r'Instance:([\w\s]+?)Edition:([\w\s]+?)(?=Instance|\n)', s)
[(' A ', ' Limited'), (' B ', ' Common '), (' C ', ' 2020 '), (' D ', ' Bla')]
If you don't want spaces around your matches you can either apply a strip to all elements like I did in my other solution, or you can modify the regex to read Instance: ([\w...
Related
I'm having trouble parsing a file. I have code that parses a file by the word Total: if its value is greater than 20.0 and returns the data. I need to change the search keyword to Tokens eth: with a value greater than 20.0 and output all data between separators ======== and additionally write all sorted values into sort.txt file. I would be grateful for professional help)
Code:
outlist = []
flag = False
def dump(list_, flag_):
if list_ and flag_:
print('\n'.join(list_))
return [], False
with open('total.txt') as file:
for line in map(str.strip, file):
if line.startswith('='):
outlist, flag = dump(outlist, flag)
else:
tokens = line.split()
if len(tokens) == 3 and tokens[1] == 'Total:':
try:
flag = float(tokens[2][:-1]) > 20.0
except ValueError:
pass
outlist.append(line)
dump(outlist, flag)
total.txt
============
| hafuia
| 0xb34a47885262f9d8673dc77de7b583961134f09fb03620b29d282c32ee6932be
| 0xD0b2612a6eE3111114b43b25322C6F08A251D38D
| Total: 47.62874464666479$
|
|
| Tokens eth:
| 20.608732$ MANA
|
| Protocols cro:
| 17.840052$ VVS Finance
| 8.953779$ V3S Finance
============
| asdf
| 0x72e164aa187feaff7cb28a74b7ff800a0dfe916594c70f141069669e9df5a23b
| 0xC7dFe558ed09F0f3b72eBb0A04e9d4e99af0bd0D
| Total: 22.908481672796988$
|
|
| Tokens eth:
| 22.376087$ SOS
============
| asdf
| 0xbce666bca3c862a2ee44651374f95aca677de16b4922c6d5e7d922cc0ac42a3d
| 0x5870923a244f52fF2D119fbf5525421E32EC006e
| Total: 9.077030269778557$
|
|
| Tokens eth:
| 8.942218$ SOS
============
This is how you can parse the file.
def parse_output(filename):
    """Collect the blocks of *filename* whose 'Total:' value exceeds 20.

    Blocks are delimited by lines starting with '======='. A block is kept
    when one of its lines contains 'Total:' and the last whitespace-separated
    token on that line (with any trailing '$' removed) is a number above 20.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of blocks, each a list of stripped lines (separators
        excluded), in file order.

    Raises:
        ValueError: If a 'Total:' line does not end in a number.
    """
    outlist = []
    lines_arr = []
    to_write = False
    with open(filename) as file:
        for line in map(str.strip, file):
            if line.startswith('======='):
                # Separator: close the current block and start a new one.
                if to_write:
                    outlist.append(lines_arr)
                lines_arr = []
                to_write = False
                continue
            lines_arr.append(line)
            if 'Total:' in line:
                # rstrip('$') rather than [:-1] so a value written without
                # the currency sign does not lose its last digit.
                num = float(line.split()[-1].rstrip('$'))
                if num > 20:
                    to_write = True
    # Keep the final block even when the file has no closing separator.
    if to_write:
        outlist.append(lines_arr)
    return outlist
def write_output(outlist, filename):
    """Append every block in *outlist* to *filename*.

    Each line of a block is written on its own line and every block is
    followed by a '=======' separator line. Nothing is written (and the
    file is not created) when *outlist* is empty.

    Args:
        outlist: Iterable of blocks, each an iterable of strings without
            trailing newlines.
        filename: Path of the file to append to.
    """
    if not outlist:
        return
    # Open the file once instead of once per line.
    with open(filename, 'a') as out_file:
        for block in outlist:
            out_file.writelines(line + '\n' for line in block)
            out_file.write('=======' + '\n')
if __name__ == '__main__':
write_output(parse_output('total.txt'), 'output.txt')
I missed the requirement about sorting the wallets. To sort, either keep a second list holding each block's value alongside outlist, or prepend the number to each block's list, sort the blocks, and skip the first element when writing.
This is written in such a way that it's easy to get, e.g., the addresses as well. Sorting is done with a simple lambda function.
from pprint import pprint

wallet_splitter = "============"
wallet_content_start = "Tokens eth:"
wallet_line_start = "|"

with open("totals.txt") as infile:
    wallets = infile.read().split(wallet_splitter)

print(wallets)
wallets_above_20 = []
for wallet in wallets:
    total = 0
    # Amounts (as strings) listed directly under the "Tokens eth:" line.
    separate = []
    contents = False
    for line in wallet.splitlines():
        if wallet_content_start in line:
            contents = True
        elif contents:
            if "$" in line:
                separate.append(line.replace(wallet_line_start, "").split("$")[0])
                total += float(separate[-1])
            else:
                # First line without an amount ends the token list.
                contents = False
    # Add the wallet once if any single token amount exceeds 20; looping
    # and appending per amount would list the same wallet several times.
    if any(float(amount) > 20 for amount in separate):
        wallets_above_20.append({
            "total": total,
            "data": wallet
        })

pprint(sorted(wallets_above_20, key=lambda i: i['total'], reverse=True))
This is another simple extensible approach you can use to achieve what you need. The comments will explain the code.
# Create a simple representational object with data for every record.
class RateObject:
    """One record (a list of text lines) together with its numeric value.

    The value is read from the line that follows the first line containing
    *delimiter*. Records without the delimiter, or without a number on the
    following line, get the value 0.0.
    """

    # You can change the delimiter to whatever you want.
    def __init__(self, text_lines: list, delimiter="Tokens eth:"):
        self.text_lines = text_lines
        # Index of the first line containing the delimiter, or None.
        index = next(
            (i for i, x in enumerate(text_lines) if delimiter in x), None
        )
        # Get the value from delimiter line
        self.value = 0.0 if index is None else self._get_value(index)

    # Override this method, to change the way you extract the value. From same line or different line etc.
    def _get_value(self, delimiter_index: int):
        """Return the number on the line after *delimiter_index*, else 0.0."""
        import re

        # Case of Tokens eth: the amount is on the next line.
        if delimiter_index + 1 >= len(self.text_lines):
            return 0.0
        value = self.text_lines[delimiter_index + 1].strip()
        number = r"\d+(?:\.\d+)?"
        # Prefer the number written directly before '$'; otherwise take the
        # first number on the line. Joining every digit on the line would
        # glue digits from token names (e.g. "V3S") onto the amount.
        match = re.search(number + r"(?=\$)", value) or re.search(number, value)
        if match:
            return float(match.group())
        # Assume 0 for unknown values
        return 0.0

    def __str__(self):
        # Return the lines as it is
        return "".join(self.text_lines)

    def __repr__(self):
        return "".join(self.text_lines)
# read the source file
with open("src.txt", "r") as src:
line_texts = src.readlines()
# Split the lines into sections, using the delimiter ========
splitters = [index for index, text in enumerate(line_texts) if text == "============\n"]
# Create a list of RateObjects
raw_objects = [RateObject(lt) for lt in [line_texts[splitters[i]:splitters[i + 1]] for i in range(len(splitters) - 1)]]
# Filter the objects, to get only the ones with value > 20
selected_objects = list(filter(lambda x: x.value > 20.0, raw_objects))
# Sort the objects by value
sorted_objects = sorted(selected_objects, key=lambda x: x.value, reverse=True)
# print(selected_objects)
# print(sorted_objects)
# Write the sorted objects to a file
with open("sorted.txt", "w") as dst:
dst.write("\n".join([str(x) for x in sorted_objects]))
Here's a simple generator-based approach.
def items(file):
    """
    Generator to yield items from filename
    whose "Tokens eth:" is above 20.0

    Items are separated by lines consisting of "============". For each
    item the amount is taken from the line following "| Tokens eth:" (the
    first word ending in "$"). Yields ``(tokens, item)`` where *item* is
    the list of the item's raw lines. An item is yielded only when a
    separator line closes it.
    """
    with open(file) as lines:
        item = []
        tokens = 0
        capture = False
        for line in lines:
            # rstrip() so a final separator without a newline still counts.
            if line.rstrip() == "============":
                if tokens > 20.0:
                    yield tokens, item
                item = []
                tokens = 0
                # Do not carry a pending capture over into the next item.
                capture = False
                continue
            if capture:
                # Pick the word ending in "$" instead of a fixed position,
                # so multi-word token names ("VVS Finance") do not break it.
                amount = next(
                    (word for word in line.split() if word.endswith("$")),
                    None,
                )
                if amount is not None:
                    tokens = float(amount.rstrip("$"))
                capture = False
            if line.startswith("| Tokens eth:"):
                # Set flag to capture next line when we get to it
                capture = True
            item.append(line)
def main():
import sys
print("============")
for tokens, item in sorted(list(items(sys.argv[1]))):
print("".join(item), end="")
print("============")
if __name__ == "__main__":
main()
For simplicity, I made the generator also perform filtering, though it would be easy to remove items with a lower total on the caller's side if you wanted to make this reusable.
Demo: https://ideone.com/UKuC6C
In fact, I would recommend that you parse this haphazard file format just once, and convert it to a standard format like CSV or JSON for further processing if this is more than a one-off.
Using regular expressions from the re module of the standard library you can, for example, split the text into blocks enclosed by the separator, then find the amount of eth in each block, sort and finally filter them.
import re

# parameters
total_txt = """from question"""
sorted_file_name = 'sort.txt'
THRESHOLD = 20.
as_dicreasing_order = False

# Separator between blocks, and the amount on the line after "Tokens eth:".
# The amount may be an integer ("20$") or a decimal ("20.6$").
_SEPARATOR = re.compile('=' * 12)
_AMOUNT = re.compile(r'Tokens eth:\n\| (\d+(?:\.\d*)?|\.\d+)\$')


def blocks_by_eth_amount(text, threshold=THRESHOLD, descending=False):
    """Return the blocks of *text* sorted by their "Tokens eth:" amount.

    A block runs from one separator up to (not including) the next one.
    Blocks without a "Tokens eth:" amount are skipped, and only blocks
    whose amount is at least *threshold* are returned.

    Args:
        text: Full file content.
        threshold: Minimum amount (inclusive) for a block to be kept.
        descending: Sort from largest to smallest amount when true.

    Returns:
        A list of block strings in the requested order.
    """
    separators = list(_SEPARATOR.finditer(text))
    blocks = [
        text[m1.start():m2.start()]
        for m1, m2 in zip(separators, separators[1:])
    ]
    amount_block_pairs = []
    for block in blocks:
        match = _AMOUNT.search(block)
        # Skip blocks without the section instead of failing on None.
        if match:
            amount_block_pairs.append((float(match.group(1)), block))
    # reverse=False for increasing order, True for the opposite
    sorted_blocks = sorted(amount_block_pairs, reverse=descending)
    return [block for amount, block in sorted_blocks if amount >= threshold]


# body
filtered_blocks = blocks_by_eth_amount(total_txt, THRESHOLD, as_dicreasing_order)
with open(sorted_file_name, 'w') as fd:
    fd.write(''.join(filtered_blocks))
Another option is to use the Python ttp (Template Text Parser) library to parse your data. The following code checks your Total values and finds the ones lower than 20.0. For each of them, it asks you to enter a new value, which then replaces that record's Tokens eth: value.
from ttp import ttp
import json
with open('total.txt') as f:
data_to_parse = f.read()
ttp_template = '''
| Total: {{total}}$
| {{tokens_eth}}$ {{ignore}}
'''
parser = ttp(data=data_to_parse, template=ttp_template)
parser.parse()
# print result in JSON format
results = parser.result(format='json')[0]
#print(results)
#converting str to json.
result = json.loads(results)
# print(result)
for i in result[0]:
# print(i)
if float(i['total']) < 20:
new_tokens_eth = float(input(f"Total value is {i['total']} lower than 20. Enter a new 'Tokens eth:' value: "))
if i['tokens_eth'] in data_to_parse:
data_to_parse = data_to_parse.replace(i['tokens_eth'], str(new_tokens_eth))
print(data_to_parse)
See the parsed data:
See the output after the code is run.
I've below file in txt format
<START>
<SUBSTART
COLA=123;
COLB=123;
COLc=ABC;
COLc=BCD;
COLD=DEF;
<SUBEND
<SUBSTART
COLA=456;
COLB=456;
COLc=def;
COLc=def;
COLD=xyz;
<SUBEND
<SUBSTART
COLA=789;
COLB=789;
COLc=ghi;
COLc=ghi;
COLD=xyz;
<SUBEND>
<END>
Expected output,
COLA,COLB,COLc,COLc,COLD
123,123,ABC,BCD,DEF
456,456,def,def,xyz
789,789,ghi,ghi,xyz
how could I implement it in this python?
I've tried using a dictionary, but since the file has repeated keys, that is not working.
You need to write a small parser for your custom format.
Here is a very naive and simple example (updated to handle an arbitrary number of duplicates):
from collections import Counter
out = []
add = False
for line in text.split('\n'): # here you could read from file instead
if line.startswith(' '): # restarting cycle
if not add:
out.append({})
c = Counter() # counter for duplicates
add = True
k,v = line.strip(' ;').split('=')
c[k] += 1
if c[k]>1: # found a duplicated column, adding suffix
k += f'_{c[k]-1}'
out[-1][k] = v
else:
add = False
df = pd.DataFrame(out)
input:
text = '''<SUBSTART
COLA=A;
COLB=B;
COLc=C;
COLc=D;
COLc=E;
COLD=F;
<SUBEND
<SUBSTART
COLA=G;
COLB=H;
COLc=I;
COLc=J;
COLc=K;
COLD=L;
<SUBSTART
COLA=M;
COLB=N;
COLc=O;
COLc=P;
COLc=Q;
COLD=R;
<SUBEND>
<END>'''
output:
COLA COLB COLc COLc_1 COLc_2 COLD
0 A B C D E F
1 G H I J K L
2 M N O P Q R
if not startswith("<"), save
if startswith('<SUBEND'), add new newline
import json


def _parse_sections(lines):
    """Group 'KEY=VALUE;' lines into sections closed by a '<SUBEND' line."""
    datas = []
    sub_datas = []
    for raw_line in lines:
        # Strip first so indented tags and entries are recognised too.
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('<SUBEND'):
            datas.append(sub_datas)
            sub_datas = []
        elif not line.startswith('<'):
            # partition() splits on the first '=' only, so values that
            # contain '=' stay intact; rstrip(';') drops the terminator
            # without eating a character when it is absent.
            key, sep, value = line.rstrip(';').partition('=')
            if sep:
                sub_datas.append({key: value})
    return datas


def main():
    """Read 'example.txt' and print its sections as indented JSON.

    The output is a list with one entry per '<SUBSTART ... <SUBEND'
    section; each entry is a list of single-key dicts, which keeps
    repeated keys and their order.
    """
    with open('example.txt', 'r') as f:
        datas = _parse_sections(f)
    print(json.dumps(datas, indent=4))


if __name__ == '__main__':
    main()
Now I'm still pretty new to Python and programming in general, and I know I've gone about this entire program in a bit of a roundabout way. But here is what I have and what I want, if anyone can help.
Begins by reading a text file e.g.
ADF101,Lecture,Monday,08:00,10:00,Jenolan,J10112,Joe Blo
ADF101,Tutorial,Thursday,10:00,11:00,Jenolan,J10115,Cat Blue
ALM204,Lecture,Monday,09:00,11:00,Tarana,T05201,Kim Toll
Then I make empty lists and append them with each index...
subjects = []
lecturer = []
for line in x:
f = line.split(',')
if len(fields) == 8:
subjects.append([0])
lecturer.append(f[7])
Then I provide an input that runs a function based on the input.
while True:
filter = input("Choose filter. Or [Q]uit: ")
if filter.upper() == 'S':
S(subjects)
break
elif filter.upper() == 'L':
L(lecturer)
break
Now if they choose L it runs this...
def L(lecturer):
print ("""
Lecturers
---------
""")
print (''.join(['[' + str(ind + 1) + '] ' + x for ind, x in enumerate(lecturer)]))
pick_lecturer = input("\npick a lecturer: ")
Which outputs like:
[1] Joe Blo
[2] Cat Blue
[3] Kim Toll
Here's where I'm stuck. I want to make it so that if the last
input is '1' it will read the file for each line with Joe Blo
and print the entire line. Without any external modules or libraries
Any guidance is appreciated. Thanks.
You can use csv module to read the file into a list. In this example, I read each row from the file into a namedtuple:
import csv
from collections import namedtuple
Item = namedtuple(
"Item", "subject type day time_from time_to some_column1 some_column2 name"
)
def L(data):
    """Let the user pick a lecturer and print every row for that lecturer.

    Prints a numbered menu of the distinct ``name`` values in *data*
    (menu order is arbitrary because the names are collected in a set),
    keeps prompting until a valid menu number is entered, then prints each
    matching row. Returns immediately when *data* has no rows.

    Args:
        data: Sequence of records exposing a ``name`` attribute.
    """
    all_lecturers = list(set(d.name for d in data))
    if not all_lecturers:
        # Nothing to choose from; prompting would never terminate.
        return
    for i, name in enumerate(all_lecturers, 1):
        print("[{}] {}".format(i, name))

    while True:
        try:
            pick_lecturer = int(input("\npick a lecturer: "))
        except ValueError:
            # Not a number: ask again. Only ValueError is caught so that
            # Ctrl-C and end-of-input can still stop the program.
            continue
        # Check the range explicitly: 0 or a negative number would
        # otherwise index from the end of the list.
        if 1 <= pick_lecturer <= len(all_lecturers):
            lecturer_name = all_lecturers[pick_lecturer - 1]
            break

    for d in data:
        if d.name == lecturer_name:
            print(d)
# 1. read the file
data = []
with open("your_file.txt", "r") as f_in:
reader = csv.reader(f_in)
for row in reader:
data.append(Item(*row))
# 2. choose a filter
while True:
filter_ = input("Choose filter. Or [Q]uit: ")
if filter_.upper() == "Q":
break
# if filter_.upper() == "S":
# S(data)
# break
elif filter_.upper() == "L":
L(data)
break
Prints:
Choose filter. Or [Q]uit: L
[1] Cat Blue
[2] Kim Toll
[3] Joe Blo
pick a lecturer: 3
Item(subject='ADF101', type='Lecture', day='Monday', time_from='08:00', time_to='10:00', some_column1='Jenolan', some_column2='J10112', name='Joe Blo')
May 1 00:00:00 date=2018-04-30 time=23:59:59 dev=A devid=1234 msg="test 1"
May 1 00:00:00 date=2018-04-31 time=00:00:01 dev=A devid=1234 msg="test 2"
Above is a sample of a log file that I am trying to convert into csv by checking letter by letter for = and save as a column value in a row.
I managed to capture columnValue if the value after the = is not a string.
Below is a part of the code that extracts the value. There is a part of the line where after =, there is a string with spaces in between. This broke the extract to start a new find. Is it possible to check the next letter for "\"" and then start saving letter by letter until the next "\"" so that I can save the Column Value as a string?
I'm using python 2.7
def outputCSV(log_file_path, outputCSVName, colValueSet):
data = []
f = open(log_file_path, "r")
values = set() # create empty set for all column values
content = f.readlines()
content = [x.strip() for x in content] #List of lines to iterate through
colValueSet.add("postingDate")
for line in content:
new_dict = dict.fromkeys(colValueSet, "")
new_dict["postingDate"]= line[0:16]
findingColHeader = True # we have to find the columns first
findingColValue = False # After column found, starting finding values
col_value = "" # Empty at first
value = "" # Empty value at first
start = False
for letter in line:
if findingColHeader:
if letter == " ":
# space means start taking in new value
# data is in this structure with space prior to column names -> " column=value"
start = True
col_value = ""
elif letter == "=":
findingColValue = True
start = False
findingColHeader = False
elif start:
col_value += letter
elif findingColValue:
if letter == " ":
new_dict[col_value] = value
value = ""
col_value = ""
findingColHeader = True
start = True
findingColValue = False
else:
value += letter
data += [new_dict]
with open(outputCSVName, 'wb') as csvfile:
fieldnames = list(colValueSet)
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in data:
writer.writerow(row)
print("Writing Complete")
# findColumnValues(a) would calculate all column value from the file path
outputCSV("ttest.log", "MyProcessedLog.csv", findColumnValues("test.log"))
you may try something like this:
>>> a = 'May 1 00:00:00 date=2018-04-30 time=23:59:59 dev=A devid=1234 msg="test 1" '
>>> a.split('=')
['May 1 00:00:00 date', '2018-04-30 time', '23:59:59 dev', 'A devid', '1234 msg', '"test 1" ']
>>> parts = a.split('=')
>>> b = []
>>> for i,j in zip(parts, parts[1:]) :
... b.append( (i[i.rfind(' ')+1:], j[:j.rfind(' ')]) )
...
>>> b
[('date', '2018-04-30'), ('time', '23:59:59'), ('dev', 'A'), ('devid', '1234'), ('msg', '"test 1"')]
>>>
I could make a cute one-liner, but I think this way it's easier to understand for you, when you see all intermediate results and can grasp the main idea -- split the line at = signs, use the last word as a keyword and the rest as the value.
You could use re module for Python (it's good to know it for any advanced text processing):
data = '''May 1 00:00:00 date=2018-04-30 time=23:59:59 dev=A devid=1234 msg="test 1"
May 1 00:00:00 date=2018-04-31 time=00:00:01 dev=A devid=1234 msg="test 2"'''
import re

# key=value where the value is either a run of characters without spaces or
# quotes, or a double-quoted string. "[^"]*" (not +) lets an empty quoted
# value such as msg="" match too. Compiled once, outside the loop.
PAIR_RE = re.compile(r'([^\s]+)=([^\s"]+|"[^"]*")')


def parse_line(line):
    """Return the (key, value) pairs found in one log line.

    Quoted values keep their surrounding double quotes.
    """
    return PAIR_RE.findall(line)


for line in data.split('\n'):
    print(parse_line(line))
Outputs:
[('date', '2018-04-30'), ('time', '23:59:59'), ('dev', 'A'), ('devid', '1234'), ('msg', '"test 1"')]
[('date', '2018-04-31'), ('time', '00:00:01'), ('dev', 'A'), ('devid', '1234'), ('msg', '"test 2"')]
The explanation of this regular pattern can be found here.
I have a txt file with the following tuple format
ABC-01 name1,10
DEF-02 name2,11
GHI-03 name3,12
JKH-04 name4,13
I may not be able to use import re. Need to do without re.
I need to split the tuples at the delimiters(ABC-01 and others are one word and I need to keep the hyphen). My output needs to be as follows
Format of the needed result
Out[]:
[(u'name1', u'ABC-01 10'),
(u'name2', u'DEF-02 11'),
(u'name3', u'GHI-03 12 '),
(u'name4', u'JKL-04 13')]
Here's what I have tried till now and the output I get
Solution 1:
def split_func(line):
line_mod = line.split(' ')
line_mod1 = line_mod.split(',')
print line_mod1
Result
Attribute Error : list object has no attribute split
Solution 2:
def split_func(line):
line_mod = line.split(' ')
a,b,c = str(line_mod).split(',')
return (b,a + " " + c)
Result
[(" u'name1", "[u'ABC-01' 10]"),
(" u'name2", "[u'DEF-02' 11]"),
(" u'name3", "[u'GHI-03' 12]"),
(" u'name4", "[u'JKL-04' 13]")]
How can I get the exact format that I am trying to get?
Here is a re example below.
import re
def main():
result = []
with open("test.txt") as f:
for line in f:
result.append(split_func(line.strip()))
print(result)
def split_func(line):
    """Turn 'CODE name,number' into ('name', 'CODE number').

    Fields may be separated by any run of whitespace and/or commas.

    Raises:
        ValueError: If the line does not contain exactly three fields.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence. The character class also absorbs repeated separators.
    a, b, c = re.split(r"[\s,]+", line.strip())
    return b, a + " " + c
if __name__ == '__main__':
main()
OR
Here is one without re
def main():
result = []
with open("test.txt") as f:
for line in f:
result.append(split_func(line.strip()))
print(result)
def split_func(line):
    """Turn 'CODE name,number' into ('name', 'CODE number') without re.

    Raises:
        ValueError: If the line lacks the whitespace or the comma.
    """
    # split(None, 1) splits on any run of whitespace, so repeated spaces or
    # tabs between the code and the name do not break the unpacking.
    a, rest = line.split(None, 1)
    b, c = rest.split(',', 1)
    return b.strip(), a + " " + c.strip()
if __name__ == '__main__':
main()
With the output looking like this
[('name1', 'ABC-01 10'), ('name2', 'DEF-02 11'), ('name3', 'GHI-03 12'), ('name4', 'JKH-04 13')]
You can do something like
def split_func(line):
    """Return (name, 'CODE number') for a line shaped like 'CODE name,number'.

    Raises:
        ValueError: If the line lacks the whitespace or the comma.
    """
    # Split on the first run of whitespace, not on a single space, so extra
    # spaces between the code and the name are tolerated.
    a, b = line.split(None, 1)
    c, d = (part.strip() for part in b.split(',', 1))
    return c, ' '.join([a, d])
Your Solution 1 does not work because split() returns a list and you can't use split() on a list.
For Solution2
x = ['ab', 'cd']
str(x) gives "['ab', 'cd']"
What you need is join() function.