Get a part of a token after a specific character

Get a part of a token after a specific character - python

I want to get a part of token in a text file. So far I wrote the code below:
from collections import Counter
import re
freq_dist = set()
words = re.findall(r'[\w+]+', open('output.txt').read())
freq_dist = Counter(words).most_common(10)
print(freq_dist)
My output.txt is as follows:
Türkiye+Noun ,+Punc terörizm+Noun+Gen ve+Conj kitle+Noun imha+Noun silah+Noun+A3pl+P3sg+Gen küresel+Adj düzey+Noun+Loc olus+Verb+Caus+PastPart+P3sg tehdit+Noun+Gen boyut+Noun+P3sg karsi+Adj+P3sg+Loc ,+Punc tüm+Det ülke+Noun+A3pl+Gen yay+Verb+Pass+Inf2+Gen önle+Verb+Pass+Inf2+P3sg hedef+Noun+A3pl+P3sg+Acc paylas+Verb+PastPart+P3pl ,+Punc daha+Noun güven+Noun+With ve+Conj istikrar+Noun+With bir+Num dünya+Noun düzen+Noun+P3sg için+PostpPCGen birlik+Noun+Loc çaba+Noun göster+Verb+PastPart+P3pl bir+Num asama+Noun+Dat gel+Verb+Pass+Inf2+P3sg+Acc samimi+Adj ol+Verb+ByDoingSo arzula+Verb+Prog2+Cop .+Punc
Ab+Noun ile+PostpPCNom gümrük+Noun Alan+Noun+P3sg+Loc+Rel kurumsal+Adj iliski+Noun+A3pl
club+Noun toplanti+Noun+A3pl+P3sg
Türkiye+Noun+Gen -+Punc At+Noun gümrük+Noun isbirlik+Noun+P3sg komite+Noun+P3sg ,+Punc Ankara+Noun Anlasma+Noun+P3sg+Gen 6+Num madde+Noun+P3sg uyar+Verb+When ortaklik+Noun rejim+Noun+P3sg+Gen uygula+Verb+Pass+Inf2+P3sg+Acc ve+Conj gelis+Verb+Inf2+P3sg+Acc sagla+Verb+Inf1 üzere+PostpPCNom ortaklik+Noun Konsey+Noun+P3sg+Gen 2+Num /+Punc 69+Num sayili+Adj karar+Noun+P3sg ile+Conj teknik+Noun komite+Noun mahiyet+Noun+P3sg+Loc kur+Verb+Pass+Narr+Cop .+Punc
nispi+Adj
nisbi+Adj
görece+Adj+With
izafi+Adj
obur+Adj
I want to get the parts after the first + sign and save them in a list in a descending form. Forexaple, in Türkiye+Noun I want to get +Noun part or in terörizm+Noun+Gen I want to get Noun+gen or in isbirlik+Noun+P3sg I want to get Noun+P3sg and after this I want to list them by their count in a descending order like how many times +Noun or +Noun+gen appeared in the text.

How about splitting your input on spaces?
from collections import Counter
words = [word.split('+', 1)[1].strip() for word in open('output.txt').read().split(' ') if len(word)]
freq_dist = Counter(words).most_common(10)
print(freq_dist)
This would give you:
[('Noun', 16), ('Punc', 8), ('Adj', 8), ('Noun+P3sg', 6), ('Num', 5), ('Conj', 4), ('Noun+Gen', 3), ('Noun+P3sg+Gen', 3), ('Noun+Loc', 2), ('Verb+PastPart+P3pl', 2)]

Related

Convert CSV with array of tuples back to int

I have an array of tuples that are stored in a csv line by line and I want to convert them back, but each time I convert them back they are still strings and I need them to be ints.
"(1013, 294)","(872, 258)","(744, 190)","(704, 124)","(758, 78)","(853, 121)","(862, 68)","(861, 130)","(861, 166)","(972, 123)","(979, 67)","(956, 145)","(949, 177)","(1088, 136)","(1096, 85)","(1061, 155)","(1050, 188)","(1201, 158)","(1198, 121)","(1152, 168)","(1132, 194)"
"(1037, 305)","(906, 259)","(798, 192)","(756, 126)","(790, 78)","(894, 109)","(882, 29)","(873, -14)","(875, -52)","(1010, 119)","(1046, 72)","(1012, 150)","(990, 192)","(1122, 139)","(1156, 101)","(1101, 174)","(1069, 209)","(1224, 172)","(1248, 140)","(1189, 187)","(1152, 214)"
"(1031, 315)","(891, 269)","(812, 196)","(863, 130)","(968, 101)","(865, 117)","(813, 39)","(791, -10)","(778, -54)","(985, 113)","(989, 17)","(997, -33)","(1004, -70)","(1102, 132)","(1135, 57)","(1093, 105)","(1056, 152)","(1208, 170)","(1219, 124)","(1163, 156)","(1117, 192)"
Desired output should look like this:
handData =[(1013, 294), (872, 258), (744, 190), (704, 124), (758, 78), (853, 121), (862, 68), (861, 130), (861, 166), (972, 123), (979, 67), (956, 145), (949, 177), (1088, 136), (1096, 85), (1061, 155), (1050, 188), (1201, 158), (1198, 121), (1152, 168), (1132, 194)]
Current code:
with open('gesture_data.csv', 'r', newline='') as f:
reader = csv.reader(f)
examples = list(reader)
Gives this:
[['(1013, 294)', '(872, 258)', '(744, 190)', '(704, 124)', '(758, 78)', '(853, 121)', '(862, 68)', '(861, 130)', '(861, 166)', '(972, 123)', '(979, 67)', '(956, 145)', '(949, 177)', '(1088, 136)', '(1096, 85)', '(1061, 155)', '(1050, 188)', '(1201, 158)', '(1198, 121)', '(1152, 168)', '(1132, 194)'], ['(1037, 305)', '(906, 259)', '(798, 192)', '(756, 126)', '(790, 78)', '(894, 109)', '(882, 29)', '(873, -14)', '(875, -52)', '(1010, 119)', '(1046, 72)', '(1012, 150)', '(990, 192)', '(1122, 139)', '(1156, 101)', '(1101, 174)', '(1069, 209)', '(1224, 172)', '(1248, 140)', '(1189, 187)', '(1152, 214)'], ['(1031, 315)', '(891, 269)', '(812, 196)', '(863, 130)', '(968, 101)', '(865, 117)', '(813, 39)', '(791, -10)', '(778, -54)', '(985, 113)', '(989, 17)', '(997, -33)', '(1004, -70)', '(1102, 132)', '(1135, 57)', '(1093, 105)', '(1056, 152)', '(1208, 170)', '(1219, 124)', '(1163, 156)', '(1117, 192)']]

You can use a regex expression to match numbers from couples and map them to integers:
import re
handData = [tuple(map(int, re.findall('\d+', string[1:-1]))) for string in examples[0]]

Python / Get unique tokens from a file with a exception

I want to find the number of unique tokens in a file. For this purpose I wrote the below code:
splittedWords = open('output.txt', encoding='windows-1252').read().lower().split()
uniqueValues = set(splittedWords)
print(uniqueValues)
The output.txt file is like this:
Türkiye+Noun ,+Punc terörizm+Noun+Gen ve+Conj kitle+Noun imha+Noun silah+Noun+A3pl+P3sg+Gen küresel+Adj düzey+Noun+Loc olus+Verb+Caus+PastPart+P3sg tehdit+Noun+Gen boyut+Noun+P3sg karsi+Adj+P3sg+Loc ,+Punc tüm+Det ülke+Noun+A3pl+Gen yay+Verb+Pass+Inf2+Gen önle+Verb+Pass+Inf2+P3sg hedef+Noun+A3pl+P3sg+Acc paylas+Verb+PastPart+P3pl ,+Punc daha+Noun güven+Noun+With ve+Conj istikrar+Noun+With bir+Num dünya+Noun düzen+Noun+P3sg için+PostpPCGen birlik+Noun+Loc çaba+Noun göster+Verb+PastPart+P3pl bir+Num asama+Noun+Dat gel+Verb+Pass+Inf2+P3sg+Acc samimi+Adj ol+Verb+ByDoingSo arzula+Verb+Prog2+Cop .+Punc
Ab+Noun ile+PostpPCNom gümrük+Noun Alan+Noun+P3sg+Loc+Rel kurumsal+Adj iliski+Noun+A3pl
club+Noun toplanti+Noun+A3pl+P3sg
Türkiye+Noun+Gen -+Punc At+Noun gümrük+Noun isbirlik+Noun+P3sg komite+Noun+P3sg ,+Punc Ankara+Noun Anlasma+Noun+P3sg+Gen 6+Num madde+Noun+P3sg uyar+Verb+When ortaklik+Noun rejim+Noun+P3sg+Gen uygula+Verb+Pass+Inf2+P3sg+Acc ve+Conj gelis+Verb+Inf2+P3sg+Acc sagla+Verb+Inf1 üzere+PostpPCNom ortaklik+Noun Konsey+Noun+P3sg+Gen 2+Num /+Punc 69+Num sayili+Adj karar+Noun+P3sg ile+Conj teknik+Noun komite+Noun mahiyet+Noun+P3sg+Loc kur+Verb+Pass+Narr+Cop .+Punc
nispi+Adj
nisbi+Adj
görece+Adj+With
izafi+Adj
obur+Adj
With this code I can get the unique tokens like Türkiye+Noun, Türkiye+Noun+Gen. But I want to get forexample Türkiye+Noun, Türkiye+Noun+Gen like only one token before the + sign. I only want Türkiye part. In the end Türkiye+Noun and Türkiye+Noun+Gen tokens needs to be same and only treated as a single unique token. I think I need to write regex for this purpose.

It seems the word you want is always the 1st in a list of '+'-joined words:
Split the splitted words at + and take the 0th one:
text = """Türkiye+Noun ,+Punc terörizm+Noun+Gen ve+Conj kitle+Noun imha+Noun silah+Noun+A3pl+P3sg+Gen küresel+Adj düzey+Noun+Loc olus+Verb+Caus+PastPart+P3sg tehdit+Noun+Gen boyut+Noun+P3sg karsi+Adj+P3sg+Loc ,+Punc tüm+Det ülke+Noun+A3pl+Gen yay+Verb+Pass+Inf2+Gen önle+Verb+Pass+Inf2+P3sg hedef+Noun+A3pl+P3sg+Acc paylas+Verb+PastPart+P3pl ,+Punc daha+Noun güven+Noun+With ve+Conj istikrar+Noun+With bir+Num dünya+Noun düzen+Noun+P3sg için+PostpPCGen birlik+Noun+Loc çaba+Noun göster+Verb+PastPart+P3pl bir+Num asama+Noun+Dat gel+Verb+Pass+Inf2+P3sg+Acc samimi+Adj ol+Verb+ByDoingSo arzula+Verb+Prog2+Cop .+Punc
Ab+Noun ile+PostpPCNom gümrük+Noun Alan+Noun+P3sg+Loc+Rel kurumsal+Adj iliski+Noun+A3pl
club+Noun toplanti+Noun+A3pl+P3sg
Türkiye+Noun+Gen -+Punc At+Noun gümrük+Noun isbirlik+Noun+P3sg komite+Noun+P3sg ,+Punc Ankara+Noun Anlasma+Noun+P3sg+Gen 6+Num madde+Noun+P3sg uyar+Verb+When ortaklik+Noun rejim+Noun+P3sg+Gen uygula+Verb+Pass+Inf2+P3sg+Acc ve+Conj gelis+Verb+Inf2+P3sg+Acc sagla+Verb+Inf1 üzere+PostpPCNom ortaklik+Noun Konsey+Noun+P3sg+Gen 2+Num /+Punc 69+Num sayili+Adj karar+Noun+P3sg ile+Conj teknik+Noun komite+Noun mahiyet+Noun+P3sg+Loc kur+Verb+Pass+Narr+Cop .+Punc
nispi+Adj
nisbi+Adj
görece+Adj+With
izafi+Adj
obur+Adj """
splittedWords = text.lower().replace("\n"," ").split()
uniqueValues = set( ( s.split("+")[0] for s in splittedWords))
print(uniqueValues)
Output:
{'imha', 'çaba', 'ülke', 'arzula', 'terörizm', 'olus', 'daha', 'istikrar', 'küresel',
'sagla', 'önle', 'üzere', 'nisbi', 'türkiye', 'gelis', 'bir', 'karar', 'hedef', '2',
've', 'silah', 'kur', 'alan', 'club', 'boyut', '-', 'anlasma', 'iliski',
'izafi', 'kurumsal', 'karsi', 'ankara', 'ortaklik', 'obur', 'kitle', 'güven',
'uygula', 'ol', 'düzey', 'konsey', 'teknik', 'rejim', 'komite', 'gümrük', 'samimi',
'gel', 'yay', 'toplanti', '.', 'asama', 'mahiyet', 'ab', '69', 'için',
'paylas', '6', '/', 'nispi', 'dünya', 'at', 'sayili', 'görece', 'isbirlik', 'birlik',
',', 'tüm', 'ile', 'düzen', 'uyar', 'göster', 'tehdit', 'madde'}
You might need to do some additional cleanup to remove things like
',' '6' '/'
Split and remove anything thats just numbers or punctuation
from string import digits, punctuation
remove=set(digits+punctuation)
splittedWords = text.lower().split()
uniqueValues = set( ( s.split("+")[0] for s in splittedWords))
# remove from set anything that only consists of numbers or punctuation
uniqueValues = uniqueValues - set ( x for x in uniqueValues if all(c in remove for c in x))
print(uniqueValues)
to get it as:
{'teknik', 'yay', 'göster','hedef', 'terörizm', 'ortaklik','ile', 'daha', 'ol', 'istikrar',
'paylas', 'nispi', 'üzere', 'sagla', 'tüm', 'önle', 'asama', 'uygula', 'güven', 'kur',
'türkiye', 'gel', 'dünya', 'gelis', 'sayili', 'ab', 'club', 'küresel', 'imha', 'çaba',
'olus', 'iliski', 'izafi', 'mahiyet', 've', 'düzey', 'anlasma', 'tehdit', 'bir', 'düzen',
'obur', 'samimi', 'boyut', 'ülke', 'arzula', 'rejim', 'gümrük', 'karar', 'at', 'karsi',
'nisbi', 'isbirlik', 'alan', 'toplanti', 'ankara', 'birlik', 'kurumsal', 'için', 'kitle',
'komite', 'silah', 'görece', 'uyar', 'madde', 'konsey'}

You can split all the tokens you have now on "+" and take only the first one.
uniqueValues = set(map(lambda x: x.split('+')[0], splittedWords))
Here I use map. Map will apply the function (the lambda part) on all values of the splittedWords.

write data in to a csv according to headers names, which indicate occurrences of items

I need to enter data in to csv using headers and put a value if the flag is available in the event else zero it. Required output is:
I am currently getting:
This is my current code, I would like to know how to generate my desired output:
inputs for code is counter1-4 shown below :
OrderedDict([('flags=40', 3971), ('flags=10004', 6244), ('flags=10100', 236), ('flags=90002', 2), ('flags=80', 2009), ('flags=10080', 5421), ('flags=4', 2886), ('flags=100', 227), ('flags=80002', 58), ('flags=10040', 8990), ('flags=0', 5)])
OrderedDict([('flags=40', 16), ('flags=10004', 6244), ('flags=10100', 236), ('flags=90002', 2), ('flags=10080', 5421), ('flags=4', 16), ('flags=80002', 11), ('flags=10040', 8990), ('flags=0', 4), ('Total', 20940)])
OrderedDict([('flags=4', 1332), ('flags=40', 1839), ('flags=80002', 3), ('flags=100', 197), ('flags=80', 935), ('Total', 4306)])
OrderedDict([('Total', 0)])
OrderedDict([('flags=40', 2116), ('flags=80', 1074), ('flags=4', 1538), ('flags=100', 30), ('flags=80002', 44), ('flags=0', 1), ('Total', 4803)])
dat = 1
with open(outputcsv,'wb') as outcsv:
writer = csv.writer(outcsv,delimiter=',')
appname = inputfile[:-3]
writer.writerow(appname.split(','))
for x in threads:
writer.writerows([x.split(',')])
#w.writeheader([x.split(',')])
if dat == 1:
w = csv.DictWriter(outcsv,counter1.keys())
w.writeheader()
w.writerow(counter1)
elif dat == 2:
w = csv.DictWriter(outcsv,counter2.keys())
w.writeheader()
w.writerow(counter2)
elif dat == 3:
w = csv.DictWriter(outcsv,counter3.keys())
w.writeheader()
w.writerow(counter3)
elif dat == 4:
w = csv.DictWriter(outcsv,counter4.keys())
w.writeheader()
w.writerow(counter4)
dat = dat +1
writer.writerows('\n')
code for how threads are being read:
exampleFile = open('top_tasks.csv')
exampleReader = csv.reader(exampleFile)
exampleData = list(exampleReader)
thread1 = exampleData[11][0]
thread2 = exampleData[12][0]
thread3 = exampleData[13][0]
thread4 = exampleData[14][0]
threads = [thread1,thread2,thread3,thread4]

I think this code meets your requirements:
from collections import OrderedDict
import csv
# build an OrderedDict of all keys
all_keys = OrderedDict()
# first column gets name of data set
all_keys[data_set_name] = data_set_name
# collect all of the known keys, and insert the thread name
for counter, thread in zip(counters, threads):
all_keys.update(counter)
counter[data_set_name] = thread
with open(outputcsv, 'wb') as outcsv:
# using all known keys, create a csv writer
w = csv.DictWriter(outcsv, fieldnames=all_keys.keys())
# output the header and data rows
w.writeheader()
w.writerows(counters)
Data Used:
outputcsv = 'output.csv'
counters = [
OrderedDict(
[('flags=40', 3971), ('flags=10004', 6244), ('flags=10100', 236),
('flags=90002', 2), ('flags=80', 2009), ('flags=10080', 5421),
('flags=4', 2886), ('flags=100', 227), ('flags=80002', 58),
('flags=10040', 8990), ('flags=0', 5)]),
OrderedDict(
[('flags=40', 16), ('flags=10004', 6244), ('flags=10100', 236),
('flags=90002', 2), ('flags=10080', 5421), ('flags=4', 16),
('flags=80002', 11), ('flags=10040', 8990), ('flags=0', 4),
('Total', 20940)]),
OrderedDict([('flags=4', 1332), ('flags=40', 1839), ('flags=80002', 3),
('flags=100', 197), ('flags=80', 935), ('Total', 4306)]),
OrderedDict([('Total', 0)]),
OrderedDict([('flags=40', 2116), ('flags=80', 1074), ('flags=4', 1538),
('flags=100', 30), ('flags=80002', 44), ('flags=0', 1),
('Total', 4803)]),
]
# code assumes thread names are in a list, make some sample names
threads = ['thread%d' % (i+1) for i in range(len(counters))]
# first column header if the name of the data set
data_set_name = 'CandyCrush 1'

Writing numbers into file python

I was trying to extract all the elements of the my data points (x,y) tuples, and put them into list of x values and y list, and transfer them to two columns in excel spreadsheet. It seems writing numbers into file is quite difficult. Can anyone shed a light on this problem?
Current state:
xlist=[list[i][0] for i in range(len(list))]
ylist=[list[i][1] for i in range(len(list))]
fob=open('c:/test/a.txt','w')
fob.write(xlist[i] for i in range(len(xlist))
i want to write down a column of numbers in notepad so that I can highlight and copy into spread sheet directly .
Below are my data.
list = [(0.496, 12.49), (0.531, 12.40), (0.578, 12.18), (0.615,
11.96), (0.657, 11.75), (0.731, 11.28), (0.785, 10.85), (0.812,
10.61), (0.883, 9.92), (0.930, 9.40), (0.979, 8.77), (1.026,
8.10), (1.081, 7.23), (1.134, 6.33), (1.189, 5.39), (1.220,
4.85), (1.273, 3.92), (1.332, 2.91), (1.364, 2.55), (1.418,
2.16), (1.467, 1.65), (1.523, 1.17), (1.569, 0.82), (1.626,
0.47), (1.678, 0.21), (1.723, 0.01), (1.776, 0.19), (1.814,
0.28), (1.869, 0.36), (1.933, 0.36), (1.972, 0.31), (2.021,
0.18), (2.081, 0.13), (2.129, 0.46), (2.169, 0.79), (2.219,
1.24), (2.280, 1.84), (2.306, 2.11), (2.358, 2.67), (2.414,
3.37), (2.471, 4.05), (2.505, 4.51), (2.562, 5.22), (2.613,
5.84), (2.652, 6.31), (2.712, 7.01), (2.758, 7.52), (2.802,
7.99), (2.869, 8.63), (2.930, 9.16), (2.971, 9.57), (3.043,
10.35), (3.078, 10.69), (3.119, 11.00), (3.174, 11.26), (3.217,
11.40), (3.261, 11.53), (3.307, 11.55), (3.371, 11.51), (3.432,
11.40), (3.479, 11.26), (3.507, 11.20), (3.557, 11.00), (3.623,
10.55), (3.663, 10.28), (3.729, 9.79), (3.768, 9.57), (3.825,
9.24), (3.880, 8.85), (3.944, 8.41), (3.969, 8.04), (4.014,
7.55), (4.086, 6.67), (4.105, 6.37), (4.166, 5.50), (4.212,
4.88), (4.266, 4.20), (4.311, 3.69), (4.364, 3.06), (4.401,
2.65), (4.453, 2.09), (4.497, 1.68), (4.556, 1.18), (4.602,
0.85), (4.644, 0.57), (4.695, 0.29), (4.754, 0.04), (4.799,
0.11), (4.847, 0.17), (4.918, 0.11), (4.959, 0.04), (4.992,
0.19), (5.063, 0.64), (5.098, 0.90), (5.157, 1.40), (5.201,
1.79), (5.245, 2.20), (5.291, 2.65), (5.326, 3.00), (5.387,
3.65), (5.420, 4.02), (5.469, 4.62), (5.538, 5.44), (5.579,
5.96), (5.629, 6.57), (5.674, 7.14), (5.724, 7.73), (5.798,
8.60), (5.823, 8.88), (5.888, 9.62), (5.919, 9.94), (5.963,
10.41), (6.009, 10.85), (6.050, 11.22), (6.115, 11.71), (6.153,
11.99), (6.222, 12.39), (6.263, 12.61), (6.302, 12.77), (6.377,
12.99), (6.414, 13.03), (6.454, 13.02), (6.522, 12.89), (6.558,
12.74), (6.626, 12.41), (6.677, 12.05), (6.729, 11.64), (6.791,
11.00), (6.832, 10.58), (6.887, 9.92), (6.949, 9.13), (6.996,
8.48), (7.028, 8.09), (7.094, 7.13), (7.123, 6.70), (7.161,
6.16), (7.213, 5.35), (7.250, 4.81), (7.332, 3.61), (7.382,
2.93), (7.420, 2.45), (7.474, 1.88), (7.514, 1.40), (7.576,
0.71), (7.600, 0.50), (7.662, 0.12), (7.725, 0.16), (7.768,
0.26), (7.810, 0.30), (7.858, 0.26), (7.904, 0.18), (7.980,
0.10), (8.021, 0.29), (8.078, 0.65), (8.133, 1.06), (8.165,
1.33), (8.218, 1.83), (8.267, 2.31), (8.321, 2.87), (8.355,
3.27), (8.413, 3.91), (8.473, 4.61), (8.519, 5.22), (8.553,
5.65), (8.643, 6.74), (8.678, 7.23), (8.734, 7.94), (8.760,
8.27), (8.803, 8.81), (8.851, 9.35), (8.905, 9.94), (8.961,
10.45), (9.009, 10.92), (9.053, 11.34), (9.106, 11.75), (9.166,
12.14), (9.228, 12.48), (9.292, 12.71), (9.340, 12.86), (9.384,
13.01), (9.412, 13.05), (9.452, 13.03), (9.472, 13.00)]
Cheers

Export it into a CSV file. Your use case is very simple and you should be able to do it using standard Python.
with open('output.csv', 'w') as f:
for x, y in l:
f.write("%s, %s\n" % (x, y))
Note: list is a reserved word in python and you should not be using it.

Use openpyxl to write .xslx files from Python:
import openpyxl
my_list = [(0.496, 12.49), (0.531, 12.40), (0.578, 12.18), (0.615,
11.96), (0.657, 11.75), (0.731, 11.28), (0.785, 10.85), (0.812,
10.61), (0.883, 9.92), (0.930, 9.40), (0.979, 8.77), (1.026,
8.10), (1.081, 7.23), (1.134, 6.33), (1.189, 5.39), (1.220,
4.85), (1.273, 3.92), (1.332, 2.91), (1.364, 2.55), (1.418,
2.16), (1.467, 1.65), (1.523, 1.17), (1.569, 0.82), (1.626,
0.47), (1.678, 0.21), (1.723, 0.01), (1.776, 0.19), (1.814,
0.28), (1.869, 0.36), (1.933, 0.36), (1.972, 0.31), (2.021,
0.18), (2.081, 0.13), (2.129, 0.46), (2.169, 0.79), (2.219,
1.24), (2.280, 1.84), (2.306, 2.11), (2.358, 2.67), (2.414,
3.37), (2.471, 4.05), (2.505, 4.51), (2.562, 5.22), (2.613,
5.84), (2.652, 6.31), (2.712, 7.01), (2.758, 7.52), (2.802,
7.99), (2.869, 8.63), (2.930, 9.16), (2.971, 9.57), (3.043,
10.35), (3.078, 10.69), (3.119, 11.00), (3.174, 11.26), (3.217,
11.40), (3.261, 11.53), (3.307, 11.55), (3.371, 11.51), (3.432,
11.40), (3.479, 11.26), (3.507, 11.20), (3.557, 11.00), (3.623,
10.55), (3.663, 10.28), (3.729, 9.79), (3.768, 9.57), (3.825,
9.24), (3.880, 8.85), (3.944, 8.41), (3.969, 8.04), (4.014,
7.55), (4.086, 6.67), (4.105, 6.37), (4.166, 5.50), (4.212,
4.88), (4.266, 4.20), (4.311, 3.69), (4.364, 3.06), (4.401,
2.65), (4.453, 2.09), (4.497, 1.68), (4.556, 1.18), (4.602,
0.85), (4.644, 0.57), (4.695, 0.29), (4.754, 0.04), (4.799,
0.11), (4.847, 0.17), (4.918, 0.11), (4.959, 0.04), (4.992,
0.19), (5.063, 0.64), (5.098, 0.90), (5.157, 1.40), (5.201,
1.79), (5.245, 2.20), (5.291, 2.65), (5.326, 3.00), (5.387,
3.65), (5.420, 4.02), (5.469, 4.62), (5.538, 5.44), (5.579,
5.96), (5.629, 6.57), (5.674, 7.14), (5.724, 7.73), (5.798,
8.60), (5.823, 8.88), (5.888, 9.62), (5.919, 9.94), (5.963,
10.41), (6.009, 10.85), (6.050, 11.22), (6.115, 11.71), (6.153,
11.99), (6.222, 12.39), (6.263, 12.61), (6.302, 12.77), (6.377,
12.99), (6.414, 13.03), (6.454, 13.02), (6.522, 12.89), (6.558,
12.74), (6.626, 12.41), (6.677, 12.05), (6.729, 11.64), (6.791,
11.00), (6.832, 10.58), (6.887, 9.92), (6.949, 9.13), (6.996,
8.48), (7.028, 8.09), (7.094, 7.13), (7.123, 6.70), (7.161,
6.16), (7.213, 5.35), (7.250, 4.81), (7.332, 3.61), (7.382,
2.93), (7.420, 2.45), (7.474, 1.88), (7.514, 1.40), (7.576,
0.71), (7.600, 0.50), (7.662, 0.12), (7.725, 0.16), (7.768,
0.26), (7.810, 0.30), (7.858, 0.26), (7.904, 0.18), (7.980,
0.10), (8.021, 0.29), (8.078, 0.65), (8.133, 1.06), (8.165,
1.33), (8.218, 1.83), (8.267, 2.31), (8.321, 2.87), (8.355,
3.27), (8.413, 3.91), (8.473, 4.61), (8.519, 5.22), (8.553,
5.65), (8.643, 6.74), (8.678, 7.23), (8.734, 7.94), (8.760,
8.27), (8.803, 8.81), (8.851, 9.35), (8.905, 9.94), (8.961,
10.45), (9.009, 10.92), (9.053, 11.34), (9.106, 11.75), (9.166,
12.14), (9.228, 12.48), (9.292, 12.71), (9.340, 12.86), (9.384,
13.01), (9.412, 13.05), (9.452, 13.03), (9.472, 13.00)]
book = openpyxl.Workbook()
sheet = book.active
for i, value in enumerate(my_list):
sheet.cell(row=i+1, column=1).value = value[0]
sheet.cell(row=i+1, column=2).value = value[1]
book.save('test.xlsx')

When you have data like numbers or objects in memory, it's generally not correct to dump that data directly into disk, you'll want to serialize it.
The easiest way to serialize it is with print which automatically calls the "serialization" method __str__. The problem with this serialization method is that's not always easy to deserialize.
When you have a data structure, like the matrix you describe, you'll want a serialization method that will preserve the structure and allow to reconstruct it in memory. In this case you can use CSV (through the csv module), JSON (through the json module) or many others.
Use CSV.

PyParsing: Is this correct use of setParseAction()?

I have strings like this:
"MSE 2110, 3030, 4102"
I would like to output:
[("MSE", 2110), ("MSE", 3030), ("MSE", 4102)]
This is my way of going about it, although I haven't quite gotten it yet:
def makeCourseList(str, location, tokens):
print "before: %s" % tokens
for index, course_number in enumerate(tokens[1:]):
tokens[index + 1] = (tokens[0][0], course_number)
print "after: %s" % tokens
course = Group(DEPT_CODE + COURSE_NUMBER) # .setResultsName("Course")
course_data = (course + ZeroOrMore(Suppress(',') + COURSE_NUMBER)).setParseAction(makeCourseList)
This outputs:
>>> course.parseString("CS 2110")
([(['CS', 2110], {})], {})
>>> course_data.parseString("CS 2110, 4301, 2123, 1110")
before: [['CS', 2110], 4301, 2123, 1110]
after: [['CS', 2110], ('CS', 4301), ('CS', 2123), ('CS', 1110)]
([(['CS', 2110], {}), ('CS', 4301), ('CS', 2123), ('CS', 1110)], {})
Is this the right way to do it, or am I totally off?
Also, the output of isn't quite correct - I want course_data to emit a list of course symbols that are in the same format as each other. Right now, the first course is different from the others. (It has a {}, whereas the others don't.)

This solution memorizes the department when parsed, and emits a (dept,coursenum) tuple when a number is found.
from pyparsing import Suppress,Word,ZeroOrMore,alphas,nums,delimitedList
data = '''\
MSE 2110, 3030, 4102
CSE 1000, 2000, 3000
'''
def memorize(t):
memorize.dept = t[0]
def token(t):
return (memorize.dept,int(t[0]))
course = Suppress(Word(alphas).setParseAction(memorize))
number = Word(nums).setParseAction(token)
line = course + delimitedList(number)
lines = ZeroOrMore(line)
print lines.parseString(data)
Output:
[('MSE', 2110), ('MSE', 3030), ('MSE', 4102), ('CSE', 1000), ('CSE', 2000), ('CSE', 3000)]

Is this the right way to do it, or am
I totally off?
It's one way to do it, though of course there are others (e.g. use as parse actions two bound method -- so the instance the method belongs to can keep state -- one for the dept code and another for the course number).
The return value of the parseString call is harder to bend to your will (though I'm sure sufficiently dark magic will do it and I look forward to Paul McGuire explaining how;-), so why not go the bound-method route as in...:
from pyparsing import *
DEPT_CODE = Regex(r'[A-Z]{2,}').setResultsName("DeptCode")
COURSE_NUMBER = Regex(r'[0-9]{4}').setResultsName("CourseNumber")
class MyParse(object):
def __init__(self):
self.result = None
def makeCourseList(self, str, location, tokens):
print "before: %s" % tokens
dept = tokens[0][0]
newtokens = [(dept, tokens[0][1])]
newtokens.extend((dept, tok) for tok in tokens[1:])
print "after: %s" % newtokens
self.result = newtokens
course = Group(DEPT_CODE + COURSE_NUMBER).setResultsName("Course")
inst = MyParse()
course_data = (course + ZeroOrMore(Suppress(',') + COURSE_NUMBER)
).setParseAction(inst.makeCourseList)
ignore = course_data.parseString("CS 2110, 4301, 2123, 1110")
print inst.result
this emits:
before: [['CS', '2110'], '4301', '2123', '1110']
after: [('CS', '2110'), ('CS', '4301'), ('CS', '2123'), ('CS', '1110')]
[('CS', '2110'), ('CS', '4301'), ('CS', '2123'), ('CS', '1110')]
which seems to be what you require, if I read your specs correctly.

data = '''\
MSE 2110, 3030, 4102
CSE 1000, 2000, 3000'''
def get_courses(data):
for row in data.splitlines():
department, *numbers = row.replace(",", "").split()
for number in numbers:
yield department, number
This would give a generator for the course codes. A list can be made with list() if need be, or you can iterate over it directly.

Sure, everybody loves PyParsing. For easy stuff like this split is sooo much easier to grok:
data = '''\
MSE 2110, 3030, 4102
CSE 1000, 2000, 3000'''
all = []
for row in data.split('\n'):
klass,num_l = row.split(' ',1)
all.extend((klass,int(num)) for num in num_l.split(','))

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Get a part of a token after a specific character - python

Related

Convert CSV with array of tuples back to int

Python / Get unique tokens from a file with a exception

write data in to a csv according to headers names, which indicate occurrences of items

Writing numbers into file python

PyParsing: Is this correct use of setParseAction()?

Categories

Resources