Save text elements separated by tab and commas in lists - python

I have the following text format in python:
126 attr1,attr7,attr4 and attr8
1 attr6,attr2,attr9,attr78,attr23,attr56,attr75,attr77
5 attr5,attr3,attr2
7 attr0
67 attr12,attr13,attr14
So I want to save the ids (126, 1, 5, etc.) in a list and each line's attributes in a list or dict. I saved the ids with the following code, but I can't save the attributes. Here is my code:
file = open("myfile.txt", "r")
lines = file.readlines()
nodes = []
skills = []  # or dict()
for x in lines:
    nodes.append(x.split('\t')[0])
    skills.append(x.split(',')[0])  # i want a list of lists or a dict with attrs

I think that this will do the trick:
for x in lines:
    x = x.split('\t')
    nodes.append(x[0])
    skills.append(x[1].strip().split(','))  # strip() drops the trailing newline before splitting

I would rather advise using a single dictionary with the ids as keys and the attributes as lists of values:
d = {}
file = open("myfile.txt", "r")
lines = file.readlines()
for line in lines:
    splitted = line.split()
    d.update({splitted[0]: splitted[1].split(',')})
print(d)
# {'126': ['attr1', 'attr7', 'attr4', 'attr8'],
# '1': ['attr6', 'attr2', 'attr9', 'attr78', 'attr23', 'attr56', 'attr75', 'attr77'],
# '5': ['attr5', 'attr3', 'attr2'],
# '7': ['attr0'],
# '67': ['attr12', 'attr13', 'attr14']}

If you want the attributes kept as joined (comma-separated) strings, then:
attr_dict = {}
with open('file.txt', 'r') as f:
    for line in f:
        attr_dict[line.split()[0]] = line.split()[1:]
print(attr_dict)
output:
{'126': ['attr1,attr7,attr4', 'and', 'attr8'], '7': ['attr0'], '5': ['attr5,attr3,attr2'], '1': ['attr6,attr2,attr9,attr78,attr23,attr56,attr75,attr77'], '67': ['attr12,attr13,attr14']}
If you want individual elements, then:
attr_dict = {}
with open('file.txt', 'r') as f:
    for line in f:
        data = line.split()
        for sub_data in data:
            attr_dict[line.split()[0]] = sub_data.split(',')
print(attr_dict)
output:
{'126': ['attr8'], '7': ['attr0'], '5': ['attr5', 'attr3', 'attr2'], '1': ['attr6', 'attr2', 'attr9', 'attr78', 'attr23', 'attr56', 'attr75', 'attr77'], '67': ['attr12', 'attr13', 'attr14']}
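For completeness, here is a compact variant of the dictionary approach (my own sketch, not from the answers above), assuming each line really is "<id><TAB><comma-separated attributes>" as the title suggests; partition splits off the id and rstrip drops the trailing newline:
with open("myfile.txt") as f:
    d = {}
    for line in f:
        node, _, attrs = line.rstrip('\n').partition('\t')
        d[node] = attrs.split(',')
print(d)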

add values to a list from specific part of a text file

I have this text:
/** Goodmorning
Alex
Dog
House
Red
*/
/** Goodnight
Maria
Cat
Office
Green
*/
I would like to have Alex, Dog, House and Red in one list and Maria, Cat, Office, Green in another list.
I have this code:
with open(filename) as f:
    for i in f:
        if i.startswith("/** Goodmorning"):
            # add files to list
        elif i.startswith("/** Goodnight"):
            # add files to other list
So, is there any way to write the script so it understands that Alex belongs to the part of the text that has Goodmorning?
I'd recommend using a dict, where the "section name" is the key:
with open(filename) as f:
    result = {}
    current_list = None
    for line in f:
        if line.startswith("/**"):
            current_list = []
            result[line[3:].strip()] = current_list
        elif line.strip() != "*/":  # strip() so the trailing newline doesn't hide the closing marker
            current_list.append(line.strip())
Result:
{'Goodmorning': ['Alex', 'Dog', 'House', 'Red'], 'Goodnight': ['Maria', 'Cat', 'Office', 'Green']}
To find which key a value belongs to, you can use the following code:
search_value = "Alex"
for key, values in result.items():
if search_value in values:
print(search_value, "belongs to", key)
break
I would recommend using regular expressions. In Python there is a module for this called re:
import re
s = """/** Goodmorning
Alex
Dog
House
Red
*/
/** Goodnight
Maria
Cat
Office
Green
*/"""
pattern = r'/\*\*([\w \n]+)\*/'
word_groups = re.findall(pattern, s, re.MULTILINE)
d = {}
for word_group in word_groups:
    words = word_group.strip().split('\n')  # one entry per line inside the block
    d[words[0]] = words[1:]
print(d)
Output:
{'Goodmorning': ['Alex', 'Dog', 'House', 'Red'], 'Goodnight':
['Maria', 'Cat', 'Office', 'Green']}
Expanding on Olvin Roght's answer (sorry, can't comment - not enough reputation), I would keep a second dictionary for the reverse lookup:
with open(filename) as f:
    key_to_list = {}
    name_to_key = {}
    current_list = None
    current_key = None
    for line in f:
        if line.startswith("/**"):
            current_list = []
            current_key = line[3:].strip()
            key_to_list[current_key] = current_list
        elif line.strip() != "*/":
            current_name = line.strip()
            name_to_key[current_name] = current_key
            current_list.append(current_name)
print(key_to_list)
print(name_to_key['Alex'])
An alternative is to convert the dictionary afterwards:
name_to_key = {n : k for k in key_to_list for n in key_to_list[k]}
(i.e. if you want to go with the regex version from ashwani).
The limitation is that this only permits one membership per name.
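If a name can belong to several sections, a reverse map from name to a list of keys avoids that limitation. A minimal sketch of my own (not from the original answers), built from the key_to_list dict above:
from collections import defaultdict

# name -> list of section keys, so a name may belong to several sections
name_to_keys = defaultdict(list)
for key, names in key_to_list.items():
    for name in names:
        name_to_keys[name].append(key)

print(name_to_keys['Alex'])  # e.g. ['Goodmorning']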

Parse vertical text in a file with repeated block

What is the best way to parse the file below? The blocks repeat multiple times.
The expected result is output to a CSV file as:
{Place: REGION-1, Host: ABCD, Area: 44...}
I tried the code below, but it only iterates over the first block and then finishes.
with open('/tmp/t2.txt', 'r') as input_data:
    for line in input_data:
        if re.findall('(.*_RV)\n', line):
            myDict = {}
            myDict['HOST'] = line[6:]
            continue
        elif re.findall('Interface(.*)\n', line):
            myDict['INTF'] = line[6:]
        elif len(line.strip()) == 0:
            print(myDict)
Text file is below.
Instance REGION-1:
ABCD_RV
Interface: fastethernet01/01
Last state change: 0h54m44s ago
Sysid: 01441
Speaks: IPv4
Topologies:
ipv4-unicast
SAPA: point-to-point
Area Address(es):
441
IPv4 Address(es):
1.1.1.1
EFGH_RV
Interface: fastethernet01/01
Last state change: 0h54m44s ago
Sysid: 01442
Speaks: IPv4
Topologies:
ipv4-unicast
SAPA: point-to-point
Area Address(es):
442
IPv4 Address(es):
1.1.1.2
Instance REGION-2:
IJKL_RV
Interface: fastethernet01/01
Last state change: 0h54m44s ago
Sysid: 01443
Speaks: IPv4
Topologies:
ipv4-unicast
SAPA: point-to-point
Area Address(es):
443
IPv4 Address(es):
1.1.1.3
Or if you prefer an ugly regex route:
import re

region_re = re.compile(r"^Instance\s+([^:]+):.*")
host_re = re.compile(r"^\s+(.*?)_RV.*")
interface_re = re.compile(r"^\s+Interface:\s+(.*?)\s+")
other_re = re.compile(r"^\s+([^\s]+).*?:\s+([^\s]*){0,1}")

myDict = {}
extra = None
with open('/tmp/t2.txt', 'r') as input_data:
    for line in input_data:
        if extra:  # value on next line from key
            myDict[extra] = line.strip()
            extra = None
            continue
        region = region_re.match(line)
        if region:
            if len(myDict) > 1:
                print(myDict)
            myDict = {'Place': region.group(1)}
            continue
        host = host_re.match(line)
        if host:
            if len(myDict) > 1:
                print(myDict)
            myDict = {'Place': myDict['Place'], 'Host': host.group(1)}
            continue
        interface = interface_re.match(line)
        if interface:
            myDict['INTF'] = interface.group(1)
            continue
        other = other_re.match(line)
        if other:
            groups = other.groups()
            if groups[1]:
                myDict[groups[0]] = groups[1]
            else:
                extra = groups[0]

# dump out final one
if len(myDict) > 1:
    print(myDict)
output:
{'Place': 'REGION-1', 'Host': 'ABCD', 'INTF': 'fastethernet01/01', 'Last': '0h54m44s', 'Sysid': '01441', 'Speaks': 'IPv4', 'Topologies': 'ipv4-unicast', 'SAPA': 'point-to-point', 'Area': '441', 'IPv4': '1.1.1.1'}
{'Place': 'REGION-1', 'Host': 'EFGH', 'INTF': 'fastethernet01/01', 'Last': '0h54m44s', 'Sysid': '01442', 'Speaks': 'IPv4', 'Topologies': 'ipv4-unicast', 'SAPA': 'point-to-point', 'Area': '442', 'IPv4': '1.1.1.2'}
{'Place': 'REGION-2', 'Host': 'IJKL', 'INTF': 'fastethernet01/01', 'Last': '0h54m44s', 'Sysid': '01443', 'Speaks': 'IPv4', 'Topologies': 'ipv4-unicast', 'SAPA': 'point-to-point', 'Area': '443', 'IPv4': '1.1.1.3'}
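The question asks for the result to be written to a CSV file rather than printed. A minimal sketch of that last step (my addition, not from the original answer): collect each completed myDict into a list called rows instead of printing it, then dump the list with csv.DictWriter, assuming all dicts share the same keys:
import csv

rows = []  # append each completed myDict here instead of printing it
# ... parsing loop from above fills `rows` ...

if rows:
    with open('/tmp/t2.csv', 'w', newline='') as out:
        writer = csv.DictWriter(out, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)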
This doesn't use much regex and could be optimized further. Hope it helps!
import re
import pandas as pd
from collections import defaultdict

_level_1 = re.compile(r'instance region.*', re.IGNORECASE)

with open('stack_formatting.txt') as f:
    data = f.readlines()

"""
Format data so that it could be split easily
"""
data_blocks = defaultdict(lambda: defaultdict(str))
header = None
instance = None
for line in data:
    line = line.strip()
    if _level_1.match(line):
        header = line
    else:
        if "_RV" in line:
            instance = line
        elif not line.endswith(":"):
            data_blocks[header][instance] += line + ";"
        else:
            data_blocks[header][instance] += line


def parse_text(data_blocks):
    """
    Generate a dict which could be converted easily to a pandas dataframe
    :param data_blocks: splittable data
    :return: dict with row values for every column
    """
    final_data = defaultdict(list)
    for key1 in data_blocks.keys():
        for key2 in data_blocks.get(key1):
            final_data['instance'].append(key1)
            final_data['sub_instance'].append(key2)
            for items in data_blocks[key1][key2].split(";"):
                print(items)
                if items.isspace() or len(items) == 0:
                    continue
                a, b = re.split(r':\s*', items)
                final_data[a].append(b)
    return final_data


print(pd.DataFrame(parse_text(data_blocks)))
This worked for me but it's not pretty:
import pandas as pd

# input_data holds the whole file read in as a single string
text = input_data
text = text.rstrip(' ').rstrip('\n').strip('\n')
# first I get ready to create a csv by replacing the headers for the data
text = text.replace('Instance REGION-1:', ',')
text = text.replace('Instance REGION-2:', ',')
text = text.replace('Interface:', ',')
text = text.replace('Last state change:', ',')
text = text.replace('Sysid:', ',')
text = text.replace('Speaks:', ',')
text = text.replace('Topologies:', ',')
text = text.replace('SAPA:', ',')
text = text.replace('Area Address(es):', ',')
text = text.replace('IPv4 Address(es):', ',')
# now I strip out the leading whitespace, cuz it messes up the split on '\n\n'
lines = [x.lstrip(' ') for x in text.split('\n')]
clean_text = ''
# now that the leading whitespace is gone I recreate the text file
for line in lines:
    clean_text += line + '\n'
# Now split the data into groups based on single entries
entries = clean_text.split('\n\n')
# create one liners out of the entries so they can be split like csv
entry_lines = [x.replace('\n', ' ') for x in entries]
# create a dataframe to hold the data for each line
df = pd.DataFrame(columns=['Instance REGION', 'Interface',
                           'Last state change', 'Sysid', 'Speaks',
                           'Topologies', 'SAPA', 'Area Address(es)',
                           'IPv4 Address(es)']).T
# now the meat and potatoes
count = 0
for line in entry_lines:
    data = line[1:].split(',')  # split like a csv on commas
    data = [x.lstrip(' ').rstrip(' ') for x in data]  # get rid of extra leading/trailing whitespace
    df[count] = data  # create an entry for each split
    count += 1  # increment the count
df = df.T  # transpose back to normal so it doesn't look weird
Output looks like this for me
Edit: Also, since you have various answers here, I tested the performance of mine. It is mildly exponential, as described by the equation y = 100.97e^(0.0003x).
Here are my timeit results.
Entries    Milliseconds
18         49
270        106
1620       394
178420     28400

How do I read a multi-line list from a file in Python?

I have a file which has lists spanning multiple lines, with the length of the lists constant. However, the number of elements in each line can vary.
How do I read this file in Python so that each list is read as a whole?
Edit: I would prefer a non-regex solution.
The file looks something like this (just for illustration):
[ -6.70031086e-02 5.93684241e-02 1.11689426e-01 1.16174825e-01
-3.74981388e-02 4.05267589e-02 2.02941950e-02 1.65661901e-01
9.88883078e-02 -1.86108038e-01 -2.09761858e-01 2.08867267e-02
-7.34964982e-02 -1.38626635e-01 1.33853648e-02 -1.11527992e-02
7.19301552e-02 5.71861453e-02 -8.56672525e-02 8.01878721e-02
-2.27990234e-03 8.93531218e-02 -7.99949542e-02 -3.89122330e-02
3.07365637e-02 -1.14912149e-02 -1.25382066e-01 1.61550958e-02
-9.03828740e-02 -8.40659663e-02 2.35458408e-02 6.62269741e-02
-6.83306251e-03 3.86000201e-02 -2.85124127e-02 -1.22550033e-01
6.14493713e-02 5.42194061e-02 -9.98141840e-02 3.87526527e-02
-1.77935660e-02 6.59185136e-03 -7.56490007e-02 -8.04342143e-03
4.22548652e-02 -4.90937680e-02 7.31833130e-02 4.60098870e-02
-3.38455513e-02 7.72312284e-02 1.69506043e-01 8.54071528e-02
-5.15969582e-02 -8.66574422e-02 2.78513003e-02 -8.26551542e-02
5.72918989e-02 -8.63238499e-02 -1.09750973e-02 -1.04178898e-01
4.04170994e-03 7.16830865e-02 1.16529778e-01 1.65875465e-01
1.82720050e-02 1.71985731e-01 -2.09263922e-03 -3.31376195e-02
1.26107544e-01 1.47209521e-02 -1.41869476e-02 5.07163629e-02
1.49011686e-01 9.49593708e-02 4.67912182e-02 -8.64533633e-02
4.12282310e-02 8.19735080e-02 1.49312839e-02 2.14010417e-01
1.43005610e-01 -6.68876693e-02 1.25497788e-01 -8.12855735e-02
1.89039335e-02 -7.57512003e-02 4.25233506e-02 -6.90079033e-02
8.08808357e-02 -3.47024412e-03 2.63141114e-02 1.61882326e-01
1.25483396e-02 1.45484000e-01 3.12147997e-02 5.61049813e-03
-1.52215753e-02 -9.00566354e-02 7.78550655e-02 2.32269196e-03
6.35183901e-02 -1.34039536e-01 1.12368152e-01 -5.65479957e-02
-1.40751451e-01 -3.24242609e-03 -2.60595884e-02 -3.79961394e-02
9.53520015e-02 1.18161231e-01 -6.31203428e-02 6.54687434e-02
-8.70579779e-02 1.64551754e-02 -4.66874018e-02 -2.02252846e-02
1.81142420e-01 -4.29894254e-02 8.62734243e-02 -1.96067482e-01
-5.18136062e-02 -1.02697751e-02 -8.20104256e-02 -7.04407394e-02
-1.37479603e-01 1.51444465e-01 1.46553725e-01 6.87731877e-02]
[ 0.13552369 -0.05061625 0.13381879 -0.09299553 -0.10647763 -0.02260791
0.00843107 0.01909993 0.0252617 -0.09204189 0.11444099 0.16380875
-0.26470438 0.04185624 0.08701419 -0.00960395 0.03196884 0.05695887
0.03903539 0.0330128 0.0088141 0.03981387 -0.2256397 0.1373885
-0.00823926 -0.23756374 0.14071368 0.15679301 0.05020505 0.00083234
0.14197688 -0.17108534 -0.03471961 -0.09328505 0.04228394 0.07565336
-0.06243521 -0.09347741 -0.00821514 -0.06649745 0.05205032 -0.00554045
-0.00386953 0.05514322 -0.0234912 -0.11922046 0.14259741 -0.04250529
0.02933454 0.09837652 -0.04943179 -0.01795183 0.11347186 -0.0262726
0.14694421 0.00120262 0.02876565 0.06762701 -0.06783341 -0.0130248
0.0304249 0.04527348 0.15238339 0.01605285 0.02574495 0.03512112
-0.05733667 -0.09585288 0.05414675 0.14885603 -0.02176115 -0.11798949
0.10624658 0.04126133 0.0355645 -0.0176413 0.01316 -0.0731855
0.06095812 -0.03693416 0.05717857 -0.06640249 0.02760602 -0.11397229
-0.08891453 -0.05422837 -0.00309273 -0.08528782 0.04416328 0.10460843
0.08477673 -0.03460682 0.26425052 0.027636 -0.01395808 -0.04762371
-0.11365297 -0.09291256 0.02920797 0.1462263 -0.1354932 -0.00904074
0.16209167 -0.0351855 0.0287815 0.082674 0.03369482 -0.04522609
0.01189264 -0.03094579 -0.1829372 -0.0331573 0.03074961 -0.01479802
-0.06882931 -0.02879945 0.04064524 0.1048708 0.11631119 -0.13730904
-0.01107442 0.07329052 0.013919 0.02282012 0.14160685 -0.08278389
0.04416744 0.17811519 0.06306098 -0.15048456 -0.08337893 0.06718753
0.02712255 0.0626005 0.05940831 0.08399926 0.22958109 -0.06148282
-0.05348093 -0.05489948 0.18494032 -0.01777483 0.03008986 0.03045709
-0.09592026 0.17701676 -0.21119906 -0.01997624 0.15930974 -0.03315869 ]
import re

p = re.compile(r'\[.*?\]', re.S)  # non-greedy, so each bracketed block is a separate match
num = re.compile(r'\S+')

f = open("lst", "r")
s = f.read()
f.close()

l = p.findall(s)
lst = []
for i in l:
    tmp = []
    num_list = num.findall(i)
    del num_list[0]           # drop the leading '['
    for n in num_list:
        n = n.rstrip(']')     # the closing ']' may be glued to the last number
        if n:
            tmp.append(n)
    lst.append(tmp)
print(lst)
lst is a list of lists read from your file.
You don't need a regex, just strip and split, mapping to float if you want floats:
def sections():
    with open("in.txt") as f:
        tmp = []
        for line in f:
            data = list(map(float, line.strip(" []\n").split()))
            if line.rstrip().endswith("]"):
                yield tmp
                tmp = []
            tmp.append(data)

from pprint import pprint as pp
pp(list(sections()))
Output:
[[[-0.0670031086, 0.0593684241, 0.111689426, 0.116174825],
[-0.0374981388, 0.0405267589, 0.020294195, 0.165661901],
[0.0988883078, -0.186108038, -0.209761858, 0.0208867267],
[-0.0734964982, -0.138626635, 0.0133853648, -0.0111527992],
[0.0719301552, 0.0571861453, -0.0856672525, 0.0801878721],
[-0.00227990234, 0.0893531218, -0.0799949542, -0.038912233],
[0.0307365637, -0.0114912149, -0.125382066, 0.0161550958],
[-0.090382874, -0.0840659663, 0.0235458408, 0.0662269741],
[-0.00683306251, 0.0386000201, -0.0285124127, -0.122550033],
[0.0614493713, 0.0542194061, -0.099814184, 0.0387526527],
[-0.017793566, 0.00659185136, -0.0756490007, -0.00804342143],
[0.0422548652, -0.049093768, 0.073183313, 0.046009887],
[-0.0338455513, 0.0772312284, 0.169506043, 0.0854071528],
[-0.0515969582, -0.0866574422, 0.0278513003, -0.0826551542],
[0.0572918989, -0.0863238499, -0.0109750973, -0.104178898],
[0.00404170994, 0.0716830865, 0.116529778, 0.165875465],
[0.018272005, 0.171985731, -0.00209263922, -0.0331376195],
[0.126107544, 0.0147209521, -0.0141869476, 0.0507163629],
[0.149011686, 0.0949593708, 0.0467912182, -0.0864533633],
[0.041228231, 0.081973508, 0.0149312839, 0.214010417],
[0.14300561, -0.0668876693, 0.125497788, -0.0812855735],
[0.0189039335, -0.0757512003, 0.0425233506, -0.0690079033],
[0.0808808357, -0.00347024412, 0.0263141114, 0.161882326],
[0.0125483396, 0.145484, 0.0312147997, 0.00561049813],
[-0.0152215753, -0.0900566354, 0.0778550655, 0.00232269196],
[0.0635183901, -0.134039536, 0.112368152, -0.0565479957],
[-0.140751451, -0.00324242609, -0.0260595884, -0.0379961394],
[0.0953520015, 0.118161231, -0.0631203428, 0.0654687434],
[-0.0870579779, 0.0164551754, -0.0466874018, -0.0202252846],
[0.18114242, -0.0429894254, 0.0862734243, -0.196067482],
[-0.0518136062, -0.0102697751, -0.0820104256, -0.0704407394]],
[[-0.137479603, 0.151444465, 0.146553725, 0.0687731877],
[0.13552369,
-0.05061625,
0.13381879,
-0.09299553,
-0.10647763,
-0.02260791],
[0.00843107, 0.01909993, 0.0252617, -0.09204189, 0.11444099, 0.16380875],
[-0.26470438, 0.04185624, 0.08701419, -0.00960395, 0.03196884, 0.05695887],
[0.03903539, 0.0330128, 0.0088141, 0.03981387, -0.2256397, 0.1373885],
[-0.00823926, -0.23756374, 0.14071368, 0.15679301, 0.05020505, 0.00083234],
[0.14197688, -0.17108534, -0.03471961, -0.09328505, 0.04228394, 0.07565336],
[-0.06243521,
-0.09347741,
-0.00821514,
-0.06649745,
0.05205032,
-0.00554045],
[-0.00386953, 0.05514322, -0.0234912, -0.11922046, 0.14259741, -0.04250529],
[0.02933454, 0.09837652, -0.04943179, -0.01795183, 0.11347186, -0.0262726],
[0.14694421, 0.00120262, 0.02876565, 0.06762701, -0.06783341, -0.0130248],
[0.0304249, 0.04527348, 0.15238339, 0.01605285, 0.02574495, 0.03512112],
[-0.05733667,
-0.09585288,
0.05414675,
0.14885603,
-0.02176115,
-0.11798949],
[0.10624658, 0.04126133, 0.0355645, -0.0176413, 0.01316, -0.0731855],
[0.06095812, -0.03693416, 0.05717857, -0.06640249, 0.02760602, -0.11397229],
[-0.08891453,
-0.05422837,
-0.00309273,
-0.08528782,
0.04416328,
0.10460843],
[0.08477673, -0.03460682, 0.26425052, 0.027636, -0.01395808, -0.04762371],
[-0.11365297, -0.09291256, 0.02920797, 0.1462263, -0.1354932, -0.00904074],
[0.16209167, -0.0351855, 0.0287815, 0.082674, 0.03369482, -0.04522609],
[0.01189264, -0.03094579, -0.1829372, -0.0331573, 0.03074961, -0.01479802],
[-0.06882931, -0.02879945, 0.04064524, 0.1048708, 0.11631119, -0.13730904],
[-0.01107442, 0.07329052, 0.013919, 0.02282012, 0.14160685, -0.08278389],
[0.04416744, 0.17811519, 0.06306098, -0.15048456, -0.08337893, 0.06718753],
[0.02712255, 0.0626005, 0.05940831, 0.08399926, 0.22958109, -0.06148282],
[-0.05348093, -0.05489948, 0.18494032, -0.01777483, 0.03008986, 0.03045709]]]
If you are storing the arrays, you might consider using numpy.save or pickle, etc. Storing in the current format is probably not the best idea.
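For example, a short round-trip sketch with pickle (my addition, reusing the sections() generator above):
import pickle

parsed = list(sections())          # nested lists of floats from the generator above

with open("vectors.pkl", "wb") as out:
    pickle.dump(parsed, out)       # save once in a binary format

with open("vectors.pkl", "rb") as inp:
    restored = pickle.load(inp)    # reload without re-parsing the text file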
Here is another solution:
file = open('database.txt', 'r')
text = file.read()
file.close()
## long version
lists = text.split(']')
lists = lists[:-1] # remove last element which is empty (because of split)
lists = [i.strip() for i in lists] # remove possible spaces and tabs
lists = [i.strip('[') for i in lists] # remove '[' that is left on beginning of every element
lists = [i.split() for i in lists] # split every element to get list
lists = [[float(j) for j in i] for i in lists] # convert lists of strings to lists of numbers
print(lists) # result is list of lists
## short version
lists = [[float(j) for j in i.strip().strip('[').split()] for i in text.split(']')[:-1]]
print(lists)
f = open('sample.txt', 'r')
y = []
for a in f:
    b = a.split()
    for c in b:
        if c[0] == '[':
            d = c[1:]
        elif c[-1] == ']':
            d = c[:-1]
        else:
            d = c
        y.append(d)
f.close()
print(y)

How to read a file block-wise in python

I am a bit stuck reading a file block-wise, and am having difficulty getting some selective data from each block.
Here is my file content:
DATA.txt
#-----FILE-----STARTS-----HERE--#
#--COMMENTS CAN BE ADDED HERE--#
BLOCK IMPULSE DATE 01-JAN-2010 6 DEHDUESO203028DJE \
SEQUENCE=ai=0:at=221:ae=3:lu=100:lo=NNU:ei=1021055:lr=1: \
USERID=ID=291821 NO_USERS=3 GROUP=ONE id_info=1021055 \
CREATION_DATE=27-JUNE-2013 SN=1021055 KEY ="22WS \
DE34 43RE ED54 GT65 HY67 AQ12 ES23 54CD 87BG 98VC \
4325 BG56"
BLOCK PASSION DATE 01-JAN-2010 6 DEHDUESO203028DJE \
SEQUENCE=ai=0:at=221:ae=3:lu=100:lo=NNU:ei=324356:lr=1: \
USERID=ID=291821 NO_USERS=1 GROUP=ONE id_info=324356 \
CREATION_DATE=27-MAY-2012 SN=324356 KEY ="22WS \
DE34 43RE 342E WSEW T54R HY67 TFRT 4ER4 WE23 XS21 \
CD32 12QW"
BLOCK VICTOR DATE 01-JAN-2010 6 DEHDUESO203028DJE \
SEQUENCE=ai=0:at=221:ae=3:lu=100:lo=NNU:ei=324356:lr=1: \
USERID=ID=291821 NO_USERS=5 GROUP=ONE id_info=324356 \
CREATION_DATE=27-MAY-2012 SN=324356 KEY ="22WS \
DE34 43RE 342E WSEW T54R HY67 TFRT 4ER4 WE23 XS21 \
CD32 12QW"
#--BLOCK--ENDS--HERE#
#--NEW--BLOCKS--CAN--BE--APPENDED--HERE--#
I am only interested in the block name, NO_USERS, and id_info of each block.
These three values should be saved to a data structure (let's say a dict), which is further stored in a list:
[{Name: IMPULSE, NO_USER=3, id_info=1021055}, {Name: PASSION, NO_USER=1, id_info=324356} . . . ]
Any other data structure which can hold the info would also be fine.
So far I have tried getting the block names by reading line by line:
fOpen = open('DATA.txt')
unique = []
for row in fOpen:
    if "BLOCK" in row:
        unique.append(row.split()[1])
print(unique)
I am thinking of a regular expression approach, but I have no idea where to start.
Any help would be appreciated. Meanwhile I am also trying; I will update if I get something.
You could use groupby to find each block, use a regex to extract the info, and put the values in dicts:
from itertools import groupby
import re

with open("test.txt") as f:
    data = []
    # find NO_USERS= followed by 1+ digits, or id_info= followed by 1+ digits
    r = re.compile(r"NO_USERS=\d+|id_info=\d+")
    grps = groupby(f, key=lambda x: x.strip().startswith("BLOCK"))
    for k, v in grps:
        # if k is True we have a block line
        if k:
            # get name after BLOCK
            name = next(v).split(None, 2)[1]
            # get the group of lines after BLOCK
            t = next(grps)[1]
            # we want the second line after BLOCK
            _, l = next(t), next(t)
            d = dict(s.split("=") for s in r.findall(l))
            # add name to dict
            d["Name"] = name
            # add dict to data list
            data.append(d)
    print(data)
Output:
[{'NO_USERS': '3', 'id_info': '1021055', 'Name': 'IMPULSE'},
{'NO_USERS': '1', 'id_info': '324356', 'Name': 'PASSION'},
{'NO_USERS': '5', 'id_info': '324356', 'Name': 'VICTOR'}]
Or without groupby: since your file follows a fixed format, we just need to extract the second line after the BLOCK line:
with open("test.txt") as f:
data = []
r = re.compile("NO_USERS=\d+|id_info=\d+")
for line in f:
# if True we have a new block
if line.startswith("BLOCK"):
# call next twice to get thw second line after BLOCK
_, l = next(f), next(f)
# get name after BLOCK
name = line.split(None,2)[1]
# find our substrings from l
d = dict(s.split("=") for s in r.findall(l))
d["Name"] = name
data.append(d)
print(data)
Output:
[{'NO_USERS': '3', 'id_info': '1021055', 'Name': 'IMPULSE'},
{'NO_USERS': '1', 'id_info': '324356', 'Name': 'PASSION'},
{'NO_USERS': '5', 'id_info': '324356', 'Name': 'VICTOR'}]
To extract values you can iterate:
for dct in data:
    print(dct["NO_USERS"])
Output:
3
1
5
If you want a dict of dicts and to access each section from 1 to n, you can store them as nested dicts using 1 to n as the key:
from itertools import count
import re

with open("test.txt") as f:
    data, cn = {}, count(1)
    r = re.compile(r"NO_USERS=\d+|id_info=\d+")
    for line in f:
        if line.startswith("BLOCK"):
            _, l = next(f), next(f)
            name = line.split(None, 2)[1]
            d = dict(s.split("=") for s in r.findall(l))
            d["Name"] = name
            data[next(cn)] = d
data["num_blocks"] = next(cn) - 1
Output:
from pprint import pprint as pp
pp(data)
{1: {'NO_USERS': '3', 'Name': 'IMPULSE', 'id_info': '1021055'},
2: {'NO_USERS': '1', 'Name': 'PASSION', 'id_info': '324356'},
3: {'NO_USERS': '5', 'Name': 'VICTOR', 'id_info': '324356'},
'num_blocks': 3}
'num_blocks' will tell you exactly how many blocks you extracted.
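Since the question mentions a regex approach, here is one more sketch of my own (hedged, not from the answers above): a single pattern over the whole file, assuming NO_USERS and id_info always follow the BLOCK name as in the sample:
import re

pattern = re.compile(
    r"BLOCK\s+(?P<Name>\S+).*?NO_USERS=(?P<NO_USERS>\d+).*?id_info=(?P<id_info>\d+)",
    re.DOTALL,
)

with open("DATA.txt") as f:
    text = f.read()

data = [m.groupdict() for m in pattern.finditer(text)]
print(data)
# [{'Name': 'IMPULSE', 'NO_USERS': '3', 'id_info': '1021055'}, ...]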

How to store CSV data in a Nested Dictionary that has a dictionary and a list?

I have the following CSV data:
Rule1,Status1,1
Rule1,Status2,1
Rule1,Status3,1
Rule1,Status4,2
Rule2,Status1,2
Rule2,Status2,1
Rule2,Status3,1
Rule2,Status4,3
I have unique rules (first column) stored in a list called Rules. I want my dictionary to look like the following:
DictionaryFull = {
'Rule1' : {1 : [Status1, Status2, Status3], 2 : [Status4]},
'Rule2' : {1 : [Status2, Status3], 2 : [Status1], 3 : [Status4]}
}
Here is what I tried:
openfile = ('data.csv', 'rU')
finalfile = csv.reader(openfile, delimiter=',')
FullDictionary = {}
for row in finalfile:
    for j in range(0, 300):  # 300 is the number of rules
        if Rules[j] not in FullDictionary:
            for i in range(1, 71):  # These are third column numbers 1 - 71
                if i == int(row[2]) and row[0] == Rules[j]:
                    FullDictionary = {Rules[j] : { i : [].append[row[1]}}
print FullDictionary
But I am getting the following as the result:
{'Rule1': {1 : None}} and so on
Am I doing something wrong? How do I accomplish this task of having a dictionary that contains both another dictionary and a list?
I tried this:
def something():
    full_dictionary = {}
    with open(DataFilePath) as f:
        reader = csv.reader(f)
    for row in reader:
        rule = row[2], status = row[0], num = int(row[5])
        r = full_dictionary.setdefault(rule, {})
        r.setdefault(num, []).append(status)
    print full_dictionary
The error: ValueError: I/O operation on closed file
How about using collections.defaultdict:
import csv
from collections import defaultdict

full_dictionary = defaultdict(lambda: defaultdict(list))
with open('data.csv') as f:
    reader = csv.reader(f)
    for rule, status, num in reader:
        full_dictionary[rule][num].append(status)
print(full_dictionary)
output:
defaultdict(<function <lambda> at 0x00000000025A6438>, {
'Rule2': defaultdict(<type 'list'>, {
'1': ['Status2', 'Status3'],
'3': ['Status4'],
'2': ['Status1']
}),
'Rule1': defaultdict(<type 'list'>, {
'1': ['Status1', 'Status2', 'Status3'],
'2': ['Status4']
})
})
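If you want exactly the shape shown in the question (a plain dict with integer keys), one more step converts the result; a small sketch of my own, assuming full_dictionary is the defaultdict built above:
# My addition (not from the original answer): flatten the defaultdicts into the
# plain nested dict from the question, turning the third column into int keys.
DictionaryFull = {
    rule: {int(num): statuses for num, statuses in inner.items()}
    for rule, inner in full_dictionary.items()
}
print(DictionaryFull)
# {'Rule1': {1: ['Status1', 'Status2', 'Status3'], 2: ['Status4']},
#  'Rule2': {1: ['Status2', 'Status3'], 2: ['Status1'], 3: ['Status4']}}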
If you don't want to use defaultdict, you have to take care of new keys yourself.
For example, using dict.setdefault:
import csv

full_dictionary = {}
with open('data.csv') as f:
    reader = csv.reader(f)
    for rule, status, num in reader:
        r = full_dictionary.setdefault(rule, {})
        r.setdefault(num, []).append(status)
print(full_dictionary)
output:
{'Rule1': {'1': ['Status1', 'Status2', 'Status3'], '2': ['Status4']},
'Rule2': {'1': ['Status2', 'Status3'], '2': ['Status1'], '3': ['Status4']}}
list.append returns None, so an expression like {i: [].append(row[1])} stores None as the value.
Amend that to:
FullDictionary = {Rules[j]: {i: [row[1]]}}
or
old_value = Rules[j].get(i, [])
old_value.append(row[1])
depending on what you're wishing to achieve.
