I have the file below in txt format:
<START>
<SUBSTART
COLA=123;
COLB=123;
COLc=ABC;
COLc=BCD;
COLD=DEF;
<SUBEND
<SUBSTART
COLA=456;
COLB=456;
COLc=def;
COLc=def;
COLD=xyz;
<SUBEND
<SUBSTART
COLA=789;
COLB=789;
COLc=ghi;
COLc=ghi;
COLD=xyz;
<SUBEND>
<END>
Expected output,
COLA,COLB,COLc,COLc,COLD
123,123,ABC,BCD,DEF
456,456,def,def,xyz
789,789,ghi,ghi,xyz
How could I implement this in Python?
I tried using a dictionary, but since the file has repeated keys, that did not work.
You need to write a small parser for your custom format.
Here is a very naive and simple example (updated to handle an arbitrary number of duplicates):
from collections import Counter

import pandas as pd

# Parse the "<SUBSTART ... <SUBEND" format held in `text` into one dict per
# record and build a DataFrame from those dicts. Repeated column names inside
# a record get a numeric suffix (COLc, COLc_1, COLc_2, ...).
out = []             # one dict per record, in input order
c = Counter()        # how often each column name occurred in the current record
in_record = False    # True while consecutive KEY=VALUE lines are being read
for line in text.split('\n'):  # here you could read from file instead
    entry = line.strip().strip(' ;')
    # A data line is any non-tag line containing "=". Testing the content
    # rather than the leading whitespace makes the parser work whether or not
    # the data lines are indented.
    if entry.startswith('<') or '=' not in entry:
        in_record = False  # a tag (or blank line) ends the current record
        continue
    if not in_record:  # first data line after a tag: start a new record
        out.append({})
        c = Counter()
        in_record = True
    # partition() splits on the first "=" only, so values may contain "=".
    k, _, v = entry.partition('=')
    c[k] += 1
    if c[k] > 1:  # found a duplicated column, adding suffix
        k += f'_{c[k] - 1}'
    out[-1][k] = v
df = pd.DataFrame(out)
input:
text = '''<SUBSTART
COLA=A;
COLB=B;
COLc=C;
COLc=D;
COLc=E;
COLD=F;
<SUBEND
<SUBSTART
COLA=G;
COLB=H;
COLc=I;
COLc=J;
COLc=K;
COLD=L;
<SUBSTART
COLA=M;
COLB=N;
COLc=O;
COLc=P;
COLc=Q;
COLD=R;
<SUBEND>
<END>'''
output:
COLA COLB COLc COLc_1 COLc_2 COLD
0 A B C D E F
1 G H I J K L
2 M N O P Q R
The idea: if a line does not start with "<", save it as a key/value pair;
if a line starts with '<SUBEND', close the current record and start a new one.
import json
def main():
    """Print the records found in ``example.txt`` as indented JSON.

    Every ``KEY=VALUE;`` line becomes a one-entry dict; the dicts collected
    since the previous ``<SUBEND`` tag form one record (a list). Other tags,
    blank lines and lines without ``=`` are ignored.
    """
    datas = []
    sub_datas = []
    with open('example.txt', 'r') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue  # blank lines carry no data
            if entry.startswith('<SUBEND'):
                datas.append(sub_datas)
                sub_datas = []
            elif not entry.startswith('<'):
                # Drop only the trailing ";" (not an arbitrary last character)
                # and split on the first "=" so values may contain "=".
                key, sep, value = entry.rstrip(';').partition('=')
                if sep:
                    sub_datas.append({key: value})
    print(json.dumps(datas, indent=4))


if __name__ == '__main__':
    main()
Related
The text file looks like
data/File_10265.data:
Apple:2kg
Apple:3kg
Banana:1kg
Banana:4kg
Some string1
data/File_10276.data:
Apple:6kg
Apple:5kg
Apple:3kg
Banana:2kg
Banana:4kg
Banana:2kg
Banana:4kg
Extra line
data/File_10278.data:
Apple:3kg
Banana:2kg
Banana:4kg
Banana:2kg
Banana:7kg
Some words
The code is as follows:
import re

import pandas as pd

# Count the "Apple" and "Banana" lines listed under each
# "data/<name>.data:" header of Samplefruit.txt and tabulate the counts
# per file.
header_re = re.compile(r'data/(?P<File>[^/]+(?=\..*data))')

Filename_list = []
Apple_list = []
Banana_list = []
with open("Samplefruit.txt", "r") as f:
    for line in f:
        match1 = header_re.findall(line)
        if match1:
            # New file section: start fresh counters so the counts are per
            # file instead of running totals over the whole input.
            Filename_list.append(match1[0])
            Apple_list.append(0)
            Banana_list.append(0)
            print('Match found:', match1)
        elif not Filename_list:
            continue  # fruit lines before the first header belong to no file
        elif line.startswith("Apple"):
            Apple_list[-1] += 1
        elif line.startswith("Banana"):
            Banana_list[-1] += 1

# The three lists have one entry per header, so their lengths always agree.
df = pd.DataFrame({'Filename': Filename_list,
                   'Apple': Apple_list,
                   'Banana': Banana_list})
The desired output:
Filename: |Apple |Banana
File_10265|2 |2
File_10276|3 |4
File_10278|1 |4
Maybe there is a more efficient way to do this but here's one solution:
from collections import Counter

# Count how often each distinct line occurs in filetest.txt, then print the
# counts and append them to file.txt.
with open('filetest.txt') as f:
    lines = f.readlines()

# Counter keeps first-seen order and counts in a single pass, replacing a
# lines.count() call per unique line.
line_counts = Counter(lines)

# Open the output once; the context manager closes it even if a write fails.
with open('file.txt', 'a') as f1:
    for line, count in line_counts.items():
        # Remove the line's own newline so the count stays on the same line
        # instead of running into the next entry.
        entry = f"{line.rstrip()} {count}"
        print(entry)
        f1.write(entry + "\n")
You simply open the file, read all lines into a list, then get rid of any duplicates. Then you loop through the list (now with the duplicates removed), and use the .count (docs) function to get the number of occurrences of each unique item in the list.
Try this,
import itertools
import re
from collections import Counter

# Split the lines of `text` into runs separated by "data/File_<n>.data:"
# headers and print, for every run of non-header lines, how often each
# distinct non-empty line occurs.
pattern = re.compile(r"data/File_\d+\.data:")  # "." escaped: match a literal dot
lines = text.split("\n")
# The key is True for ordinary lines and False for header lines, so groupby
# yields alternating header / content runs.
files = itertools.groupby(lines, lambda line: pattern.search(line) is None)
for k, content in files:
    if k:
        # Single pass over the run instead of one list.count() per word.
        counts = dict(Counter(word for word in content if word != ""))
        print(counts)
Output -
{'Banana:': 2, 'Apple:': 2}
{'Banana:': 4, 'Apple:': 3}
{'Banana:': 4, 'Apple:': 1}
NOTE: New changes have been made to the code as per the changes in the question.
Try this:
import re

text = {}


def unit_cal(val1, val2):
    """Add two quantities that carry a unit and return the sum with the unit.

    The first run of digits in each argument is taken as its amount and the
    first run of letters in ``val1`` as the unit. Passing ``False`` as
    ``val2`` means there is nothing to add yet, so ``val1``'s amount is
    returned unchanged (still with its unit).
    """
    amounts_1 = re.findall("[0-9]+", val1)
    units = re.findall("[a-zA-Z]+", val1)
    if val2 == False:  # noqa: E712 - False is the "no previous value" marker
        total = int(amounts_1[0])
    else:
        amounts_2 = re.findall("[0-9]+", val2)
        total = int(amounts_1[0]) + int(amounts_2[0])
    # Leave off the unit part below to return only the numeric value.
    return str(total) + units[0]
# Read item.txt, total the quantities of every item listed under each
# "data..." header line, then print the summary and write it to new_items.txt.
with open("item.txt", "r") as f1:
    temp_key = None  # header line of the section currently being read
    for line in f1:
        if "data" in line:
            # The header line (including its newline) is the section key.
            temp_key = line
            text[temp_key] = {}
            continue
        name, sep, quantity = line.strip().partition(":")
        # Skip blank lines, free-text lines without "name:quantity", and
        # anything that appears before the first header.
        if temp_key is None or not sep or not quantity:
            continue
        # False tells unit_cal there is no earlier quantity to add.
        previous = text[temp_key].get(name, False)
        text[temp_key][name] = unit_cal(quantity, previous)

parts = []
for main_key, items in text.items():
    parts.append(main_key + "\n")
    for sub_key, total in items.items():
        parts.append(sub_key + " : " + str(total) + "\n\n")
final_text = "".join(parts)

print(final_text)  # final output is displayed in the idle
with open("new_items.txt", "w") as f2:
    f2.write(final_text)  # the output is also written to a new file
Output:
data/File_10265.data:
Apple : 5kg
Banana : 5kg
data/File_10276.data:
Apple : 14kg
Banana : 12kg
data/File_10278.data:
Apple : 3kg
Banana : 15kg
Here, I have posted an answer. Thanks, @Mani, @CarySwoveland, @Zero, and @M B for your support. The code is as follows:
import pandas as pd

# Count the lines starting with "Apple" / "Banana" under each header line
# (a line containing "data") of Samplefruit.txt and show the counts per file.
text = {}  # file name -> {'Apple': count, 'Banana': count}
with open(r"Samplefruit.txt", "r") as file:
    Filename = None  # no section has been opened yet
    for line in file:
        if "data" in line:
            # "data/File_10265.data:" -> "File_10265"
            Filename = line.split('/')[-1].split('.')[0]
            text[Filename] = {'Apple': 0, 'Banana': 0}
            print('----------------')
            print(Filename)
        elif Filename is not None and line.startswith(("Apple", "Banana")):
            fruit = 'Apple' if line.startswith("Apple") else 'Banana'
            text[Filename][fruit] += 1
            print('Apple:', text[Filename]['Apple'])
            print('Banana:', text[Filename]['Banana'])

df = pd.DataFrame(
    {
        "Filename": list(text),
        "Apple": [x['Apple'] for x in text.values()],
        "Banana": [x['Banana'] for x in text.values()],
    }
)
print(df)
I have a list of Strings:
myStrings = [Account,Type, myID]
I also have a txt file with numbers associated with these Strings, e.g:
[90: 'Account', 5: 'Type', 6: 'MyID', 8: 'TransactionNum', 9: 'Time']
How can I print only the numbers and strings from the txt file that are in myStrings? For example, since 'Time' is not in myStrings, I do not want to print it. I would also like to turn the contents of this txt file into a list.
This should help you:
# Read "number: 'name'" pairs separated by commas from the text file and
# print the number and name of every pair whose name is in myStrings.
myStrings = ['Account', 'Type', 'myID']

with open("D:\\Test.txt", "r") as f:
    txt = f.read()

# Parse each item as one (number, name) pair so the two lists below can never
# get out of step; surrounding "[" / "]" are tolerated.
numslst = []
strlst = []
for item in txt.replace('\n', ' ').strip().strip('[]').split(','):
    number, sep, name = item.partition(':')
    if not sep:
        continue  # not a "number: name" item (e.g. empty trailing piece)
    numslst.append(int(number.strip()))
    strlst.append(name.replace("'", "").replace(' ', ''))

for number, name in zip(numslst, strlst):
    if name in myStrings:
        print(number)
        print(name)
Output:
90
Account
5
Type
Since you have said that the file does not contain the [ and ] characters, you could make it work like this:
import json
# Keep only the "number: 'name'" pairs of text-file.txt whose name appears in
# myStrings, as a dict of number -> name.
myStrings = ['Account', 'Type', 'myID']

with open('text-file.txt') as filename:
    file_text = filename.read()

file_text_list = file_text.split(',')
file_text_dict = {}
for item in file_text_list:
    # Each item is expected to hold exactly two whitespace-separated tokens.
    raw_key, raw_value = item.split()
    k = raw_key.replace(":", "")
    v = raw_value.replace("'", "")
    if v not in myStrings:
        continue
    file_text_dict[k] = v

print(file_text_dict) # output => {'90': 'Account', '5': 'Type'}
print(list(file_text_dict.values())) # output => ['Account', 'Type']
10.2 Write a program to read through the mbox-short.txt and figure out the distribution by hour of the day for each of the messages. You can pull the hour out from the 'From ' line by finding the time and then splitting the string a second time using a colon.
From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008
Once you have accumulated the counts for each hour, print out the counts, sorted by hour as shown below.
My Code:
# Count the messages per hour of day, taken from the "From " lines of an
# mbox file, and print the counts sorted by hour.
fname = input("Enter file:")
dic = {}
with open(fname) as fhandle:
    for line in fhandle:
        if not line.startswith("From "):
            continue
        fields = line.split()
        if len(fields) < 6:
            continue  # no time stamp on this line
        # fields[5] is the time "HH:MM:SS"; count the hour as one key.
        # Looping over the hour string itself would count single digits.
        hour = fields[5].split(":")[0]
        dic[hour] = dic.get(hour, 0) + 1

# Sort the (hour, count) pairs; tuples sort by their first element, the hour.
lst = sorted(dic.items())
for k1, v1 in lst:
    print(k1, v1)
Desired Output:
04 3
06 1
07 1
09 2
10 3
11 6
14 1
15 2
16 4
17 2
18 1
19 1
My Output:
enter image description here
I don't understand what's going wrong.
Working code. I have broken the code down into as simple a form as I can, so it should be easy for you to understand.
# Count the messages per hour of day from the "From " lines of a mail file
# and print the counts sorted by hour.
d = dict()
fname = input('enter the file name : ')
try:
    fopen = open(fname, 'r')
except OSError:
    # Stop here: carrying on would only fail later with a NameError on fopen.
    print('wrong file name !!!')
    raise SystemExit(1)

with fopen:
    for line in fopen:
        stline = line.strip()
        # "From " (with the space) selects the envelope lines and excludes
        # the "From:" header lines.
        if not stline.startswith('From '):
            continue
        spline = stline.split()
        if len(spline) < 6:
            continue  # no time stamp on this line
        t = spline[5].split(':')[0]  # "09:14:16" -> "09"
        d[t] = d.get(t, 0) + 1

lst = sorted(d.items())
for k, v in lst:
    print(k, v)
# Find the sender with the most "From " lines in an mbox file.
name = input("Enter file:")
if len(name) < 1:
    name = "mbox-short.txt"

emailcount = dict()
with open(name) as handle:
    for line in handle:
        if not line.startswith("From "):
            continue
        fields = line.split()
        if len(fields) < 2:
            continue  # "From " line without an address
        sender = fields[1]
        emailcount[sender] = emailcount.get(sender, 0) + 1

# Keep the first sender with the highest count; both stay None when the file
# holds no "From " lines.
bigcount = None
bigword = None
for word, count in emailcount.items():
    if bigcount is None or count > bigcount:
        bigcount = count
        bigword = word
print(bigword, bigcount)
# Count the messages per hour of day from the "From" lines of an mbox file
# and print the counts sorted by hour.
name = input("Enter file:")
if len(name) < 1:
    name = "mbox-short.txt"

counts = {}
with open(name) as handle:
    for line in handle:
        word = line.split()
        # The time stamp is the sixth field, so shorter lines are skipped.
        if len(word) < 6 or word[0] != "From":
            continue
        # Take the part before the first ":" directly; slicing the repr of a
        # list only works by accident for two-digit hours.
        hour = word[5].split(":")[0]
        counts[hour] = counts.get(hour, 0) + 1

lst = sorted(counts.items())
for k, v in lst:
    print(k, v)
# Tally the hour of every "From " line in mbox-short.txt and print the
# tallies in ascending hour order.
counts = dict()
fill = open("mbox-short.txt")
for line in fill:
    if not line.startswith("From "):
        continue
    # Sixth whitespace-separated field is the time; its first ":" piece is
    # the hour.
    hour = line.split()[5].split(":")[0]
    counts[hour] = counts.get(hour, 0) + 1

l = sorted(counts.items())
for hour, total in l:
    print(hour, total)
I listened carefully to the online lessons. This is my code, based on what I learned in class; I think it will be easy for you to understand.
# Count the messages per hour of day from the "From" envelope lines of an
# mbox file and print the counts sorted by hour.
fn = input('Please enter file: ')
if len(fn) < 1:
    fn = 'mbox-short.txt'

di = dict()
with open(fn) as hand:
    for line in hand:
        wds = line.split()
        # Require "From" as the FIRST word and enough fields for a time
        # stamp; testing "'From' in wds" would also pick up body lines that
        # merely contain the word.
        if len(wds) < 6 or wds[0] != 'From':
            continue
        ch = wds[5].split(':')[0]  # "09:14:16" -> "09"
        di[ch] = di.get(ch, 0) + 1

tmp = sorted(di.items())
for h, t in tmp:
    print(h, t)
# Histogram of message hours taken from the "From " lines of an mbox file,
# printed in ascending hour order.
name = input("Enter file:")
if len(name) < 1:
    name = "mbox-short.txt"
handle = open(name)
counts = dict()
for raw in handle:
    line = raw.strip()
    if line.startswith("From "):
        fields = line.split()
        # First ":"-separated piece of the sixth field is the hour.
        piece = fields[5].split(":")[0]
        counts[piece] = counts.get(piece, 0) + 1

lst = sorted(counts.items())
for k, v in lst:
    print(k, v)
# Count the messages per hour of day from the "From " lines of an mbox file
# and print the counts sorted by hour.
fname = input("Enter file:")
dic = {}
with open(fname) as fhandle:
    for line in fhandle:
        if not line.startswith('From '):
            continue
        fields = line.split()
        if len(fields) < 6:
            continue  # no time stamp on this line
        # Index 5 of the split line is the time; its first ":" piece is the hour.
        bline = fields[5].split(':')[0]
        # Build a dictionary mapping each hour to its number of occurrences.
        dic[bline] = dic.get(bline, 0) + 1

# Now it's time to access the dictionary and sort in some way.
# items() gives the (key, value) pairs of the dictionary; sorting the pairs
# orders them by key, i.e. by hour.
lst = sorted(dic.items())
for k1, v1 in lst:
    print(k1, v1)
I have a text file of format like this
10:45 a b c
x 0 1 2
y 4 5 6
z 7 8 9
I want to make x a key with 0, 1, 2 as its value in a list, and the same for y and z.
# NOTE(review): fragment from a question; `f`, `time`, `L1` and `i` are used
# here but not defined in the visible code, and the original indentation was
# lost, so the nesting of the statements below cannot be determined.
# NOTE(review): no `break` is visible for this `while(1)` loop.
while(1):
line = f.readline()
# Looks for `time` inside the line just read and, on a hit, stores that line.
if time in line:
print (line)
L1.append(line)
# Iterates the remaining lines of `f`, storing lines while i < 3.
for count,line in enumerate(f):
if (i < 3):
L1.append(line)
print ("Line{} : {}".format(count,line.strip()))
i=i+1
#print(L1)
# range(1) yields only k = 0, i.e. the first stored entry.
for k in range(1):
print(L1[k])
test1 = L1[k]
a1 = test1.split()
print (a1[1])
# The comprehension uses `a1[1]` as its loop target and iterates over L1[k]
# itself (presumably a string, so character by character - TODO confirm);
# every iterated element becomes a key mapped to the whole L1[k].
# Binding the result to `dict` also shadows the builtin of that name.
dict = {a1[1]: L1[k] for a1[1] in L1[k]}
print (dict)
# range(1,3) yields k = 1 and k = 2.
for k in range(1,3):
#print("hey")
print (L1[k]) #will list the single row
test = L1[k]
#print(test)
a = test.split()
print (a[0])
# Same pattern as above: `a[0]` is the loop target over the elements of L1[k].
dict = {a[0]:L1[k] for a[0] in L1[k]}
print (dict)
Any idea what i am doing wrong here?
P.S. - I am new to python
You could try this:
# Build {first field: [remaining fields]} from every line of the open file
# `f` after the header line.
my_dict = {}
lines = f.readlines()[1:]  # readlines (lower-case "l"); drop the header line
for line in lines:
    # split() without an argument also removes the trailing newline and
    # collapses repeated blanks.
    line_list = line.split()
    if not line_list:
        continue  # blank line
    key, *values = line_list
    my_dict[key] = values
This will accomplish what it is I think you need (assuming your text file is stored in the same directory and you replace 'test.txt' with your filename):
# Read test.txt as one flat list of whitespace-separated tokens and turn each
# group of four tokens after the first four into {token: [next three tokens]}.
with open('test.txt', 'r') as values:
    contents = values.read().strip().split()

new_dict = {}
# Start at index 4 to skip the first four tokens (the header row); step by
# four so each pass handles one complete row.
for i in range(4, len(contents) - 3, 4):
    row_key = contents[i]
    row_values = contents[i + 1:i + 4]
    new_dict[row_key] = row_values
print(new_dict)
or this, if you want the values as integers:
# Same as the string version, but the three values of each row are converted
# to integers.
with open('test.txt', 'r') as values:
    contents = values.read().strip().split()

new_dict = {}
# Index 4 is the first token after the four header tokens; each row spans
# four tokens.
for i in range(4, len(contents) - 3, 4):
    new_dict[contents[i]] = [int(token) for token in contents[i + 1:i + 4]]
print(new_dict)
Try this
import string
# Collect {first field: [other three fields]} for every four-field line that
# follows the line starting with "10:45".
start_found = False
result_dict = {}
# The context manager closes the file once the loop is done.
with open("stack1_input.txt", mode='r') as infile:
    for line in infile:
        if line.startswith("10:45"):
            start_found = True  # data rows begin after this header line
            continue
        if start_found:
            values = line.split()
            if len(values) == 4:
                result_dict[values[0]] = values[1:]
print(result_dict)
I have a list like this
GroupID,Number
yellow,1
yellow,2
tan,0
blue,1
black,2
black,3
What I want is this
GroupID,Number
yellow,3
tan, 0
blue,1
black,5
So I want to add the numbers associated with each groupID.
This is what I got, but have difficulty with the result statement:
from collections import defaultdict

# Sum the numbers of each GroupID in metal_modules.csv and write one
# "GroupID,total" line per group to out.txt.
d = defaultdict(list)  # group name -> list of its numbers
with open("metal_modules.csv", "r") as f:
    sheet = f.readlines()

for line in sheet[1:]:  # sheet[0] is the header row
    spl = line.strip().split(",")
    if len(spl) < 2:
        continue  # blank or malformed line
    name = spl[0]
    # Convert here so the values can be summed below.
    d[name].append(int(spl[1]))

# Header row first, then one line per group in first-seen order.
result = "".join(sheet[:1])
for name, numbers in d.items():
    result += "%s,%d\n" % (name, sum(numbers))

with open("out.txt", "w") as outfile:
    outfile.write(result)
keep it simple
# One "group, total" line per key of d. The values collected in d may be
# strings read from the file, so convert each to int before summing.
result = "".join(
    "%s, %s\n" % (group, sum(int(n) for n in d[group])) for group in d
)
You could try the below if the order won't be an important issue for you.
from collections import defaultdict

# Group the second column of 'infile' by the first column, then write the
# header followed by "group,sum" lines to 'outfile'.
d = defaultdict(list)
with open('infile') as f:
    h = f.readline()   # header line, copied to the output unchanged
    m = f.readlines()  # all remaining data lines

for row in m:
    fields = row.rstrip().split(',')
    d[fields[0]].append(fields[1])

with open('outfile', 'w') as w:
    w.write(h)
    for group, numbers in d.items():
        total = sum(int(number) for number in numbers)
        w.write(group + "," + str(total) + "\n")
Take a look at the following:
# Sum the Number column per GroupID of metal_modules.csv and write the totals,
# preceded by a header line, to out.txt.
counter = {}  # group name -> running total
with open("metal_modules.csv", "r") as f:
    next(f, None)  # skip the header row (no error on an empty file)
    for line in f:
        k, sep, v = line.strip().partition(",")
        if not sep:
            continue  # blank line or line without a comma
        counter[k] = counter.get(k, 0) + int(v)

with open("out.txt", "w") as outfile:
    rows = ["GroupID,Number"]
    rows.extend("%s,%s" % (item, total) for item, total in counter.items())
    # Joined with newlines and no trailing newline.
    outfile.write("\n".join(rows))