I have a text file that contains current between multiple ports like:
current from A:
B - 10
C - 6
Current from B:
A - 11
C - 4
Current from C:
A - 5
B - 5
I need to find the avg current between same ports, my output should be like :
current A-B is 10.5
current A-C is 5.5
current B-C is 4.5
I was thinking of using nested key value pair. is there any other way i could solve this in python? The code i was thinking was
import re
pat = re.compile("current from")
current = {}
with open(fileName) as f:
for line in f:
if pat.search(line):
key1 = (line.split()[2])
elif line != "\n" :
current[key1][line.split()[0]].append(line.split()[2])
for key1 in current:
for key2 in current[key1]:
avg = ((current[key1][key2] + current[key2][key1])/2)
print("current " + key1 + "-" + key2 + " is " + str(avg))
how about this
import re, collections
def extraer_data(fname):
with open(fname) as file:
for raw in re.split(r'current from', file.read(), flags= re.IGNORECASE ):
raw = raw.strip()
if raw:
key,rest = raw.split(":")
data = [ (c,int(n)) for c,n in re.findall("(\w+) - (\d+)",rest) ]
yield (key, data)
def process(fname):
data = collections.defaultdict(list)
for p1, ports in extraer_data(fname):
for p2, val in ports:
data[frozenset((p1,p2))].append(val)
for key,val in data.items():
print( "current {} is {}".format("-".join(sorted(key)), sum(val)/len(val)))
as we are using re, lets try using it to its fullest, or at the very least to the best I can :)
first I take the whole file and divide it at current from which give us this
A:
B - 10
C - 6
------------------------------------------
B:
A - 11
C - 4
------------------------------------------
C:
A - 5
B - 5
from there the extraction is more easy, split at : to get the first letter and finall to get the pairs and process them accordingly
>>> list(extraer_data("test.txt"))
[('A', [('B', 10), ('C', 6)]), ('B', [('A', 11), ('C', 4)]), ('C', [('A', 5), ('B', 5)])]
>>>
once we get the data from the file in a format as show above, is the turn to group them in pairs, and as the order is irrelevant I pack them in a frozenset so they can be used as an dictionary key, and for said dictionary I use a defaultdict of list and once that everything is tie in a little nice package, the rest is a piece of cake
>>> process("test.txt")
current A-B is 10.5
current B-C is 4.5
current A-C is 5.5
>>>
Related
staright to the point:
my input:
d = {'Key1': [('aaaaa', '834M', '118G'),
('bbbbb', '216G', '220.3M')],
'Key2': [('ccccc', '790M', '162G'),
('ddddd', '203G', '34.8G'),
('eeeee', '126M', '112G')],
'Key3': [('fffff', '921G', '30.8M'),
('ggggg', '235G', '2.63G')]}
I have this so far and it works but only for G (Gb) values:
for p, vl in pools.items():
alloc = ('{}G'.format(round(sum([float(j[1].split('G')[0]) for j in vl]))))
free = ('{}G'.format(round(sum([float(j[2].split('G')[0]) for j in vl]))))
I need to add values accordingly:
from key1 aaaaa value 834M +
from key1 bbbbb value 216G
then
from key1 aaaaa value 118G +
from key1 bbbbb value 220.3M
and so on for every key
so the output will look like this:
216.8G 118.2G
and so on.
I'll refactor this a bit to make the lines stay within 80 characters long and to improve readability:
def split(j, i):
if 'G' in j[i]:
return float(j[i].split('G')[0])
return float(j[i].split('M')[0]) / 1000
for p, vl in pools.items():
alloc = ('{}G'.format(round(sum([split(j, 1) for j in vl]))))
free = ('{}G'.format(round(sum(split(j, 2) for j in vl]))))
You could also write the split function as:
def split(j, i):
suffix = j[i][-1]
value = float(j[i][:-1])
return value if suffix == 'G' else value / 1000
I suggest you code some functions to parse your input and to create the output the way you want.
A simple example could be:
def toFloat(s):
return float(s.replace("G","e9").replace("M","e6").replace("k","e3"))
def toPString(num):
lv = math.log(num,1000)
prefs = ['k','M','G','T']
return "{:.1f}{}".format(num/(1000**int(lv)),prefs[int(lv)-1])
Then you can do:
for p, vl in d.items():
alloc = toPString(sum([toFloat(l[1]) for l in vl]))
free = toPString(sum([toFloat(l[2]) for l in vl]))
print(p, alloc, free)
It gives me:
Key3 1.2T 2.7G
Key2 203.9G 308.8G
Key1 216.8G 118.2G
Hope that is what you are looking for.
I have a text file which is laid out like this
John, 3,4,5
Barry, 8,5,3
Steve,7,3,2
I want to be able to sort this by reading from a text file in python by highest to lowest for each person then put them in order so that the output would be something like this.
Barry 8
Steve 7
John 5
I also want to sort it based on average so that it will calculate the average from the three scores put it in order then print out the answer like so.
Barry 5.3333
John 4
Steve 4
I hope this helps, or at least provides food for thought. I am starting from your statement, "I know how to read and write from a text file".
Make a dict from the data in the text file:
d = {'John': [3, 4, 5], 'Barry': [8, 5, 3], 'Steve': [7, 3, 2]}
Make an averaging function:
def avg(s):
return float(sum(s)) / len(s) if len(s) > 0 else float('nan')
Make some new dicts to keep the average and maximum scores:
g = {key: max(d[key]) for key in d}
h = {key: avg(d[key]) for key in d}
Then this code will print the results you want:
for key in sorted(g, key=g.get, reverse=True):
print(key, g[key])
for key in sorted(h, key=h.get, reverse=True):
print(key, h[key])
Maybe Pandas is a good direction:
import pandas as pd
txt = '''John, 3,4,5
Barry, 8,5,3
Steve,7,3,2'''
with open('txt_file', 'w') as f:
f .write(txt)
df = pd.read_csv('txt_file',header=None)
df['average'] = df.mean(axis=1)
df.sort_values('average',inplace=True)
print df
output:
0 1 2 3 average
0 John 3 4 5 4.000000
2 Steve 7 3 2 4.000000
1 Barry 8 5 3 5.333333
Maybe something like this
from __future__ import division
from operator import attrgetter
class Player(object):
def __init__(self, name, scores):
self.name = name
self.scores = scores
self.highscore = max(scores)
self.avgscore = sum(scores) / len(scores)
with open('scorez.txt') as f:
players = []
for line in f:
elems = line.split(',')
players.append(Player(elems[0], map(int, elems[1:])))
byhighscore = sorted(players, key=attrgetter('highscore'), reverse=True)
byavg = sorted(players, key=attrgetter('avgscore'), reverse=True)
print('')
for p in byhighscore:
print('{0} {1:g}'.format(p.name, p.highscore))
print('')
for p in byavg:
print('{0} {1:g}'.format(p.name, p.avgscore))
Read the file into a list of players and pre-compute the max score and average (this could be deferred if the list of players and or scores were large). Then just sort on the required attribute and print.
The attrgetter function might seem a bit strange to a beginner, but it basically just returns the given attribute for each element in the list being sorted.
I have two CSV files that I want to compare one looks like this:
"a" 1 6 3 1 8
"b" 15 6 12 5 6
"c" 7 4 1 4 8
"d" 14 8 12 11 4
"e" 1 8 7 13 12
"f" 2 5 4 13 9
"g" 8 6 9 3 3
"h" 5 12 8 2 3
"i" 5 9 2 11 11
"j" 1 9 2 4 9
So "a" possesses the numbers 1,6,3,1,8 etc. The actual CSV file is 1,000s of lines long so you know for efficiency sake when writing the code.
The second CSV file looks like this:
4
15
7
9
2
I have written some code to import these CSV files into lists in python.
with open('winningnumbers.csv', 'rb') as wn:
reader = csv.reader(wn)
winningnumbers = list(reader)
wn1 = winningnumbers[0]
wn2 = winningnumbers[1]
wn3 = winningnumbers[2]
wn4 = winningnumbers[3]
wn5 = winningnumbers[4]
print(winningnumbers)
with open('Entries#x.csv', 'rb') as en:
readere = csv.reader(en)
enl = list(readere)
How would I now search cross reference number 4 so wn1 of CSV file 2 with the first csv file. So that it returns that "b" has wn1 in it. I imported them as a list to see if I could figure out how to do it but just ended up running in circles. I also tried using dict() but had no success.
If I understood you correctly, you want to find the first index (or all indexes) of numbers in entries that are winning. If you want it, you can do that:
with open('winningnumbers.csv', 'rb') as wn:
reader = csv.reader(wn)
winningnumbers = list(reader)
with open('Entries#x.csv', 'rb') as en:
readere = csv.reader(en)
winning_number_index = -1 # Default value which we will print if nothing is found
current_index = 0 # Initial index
for line in readere: # Iterate over entries file
all_numbers_match = True # Default value that will be set to False if any of the elements doesn't match with winningnumbers
for i in range(len(line)):
if line[i] != winningnumbers[i]: # If values of current line and winningnumbers with matching indexes are not equal
all_numbers_match = False # Our default value is set to False
break # Exit "for" without finishing
if all_numbers_match == True: # If our default value is still True (which indicates that all numbers match)
winning_number_index = current_index # Current index is written to winning_number_index
break # Exit "for" without finishing
else: # Not all numbers match
current_index += 1
print(winning_number_index)
This will print the index of the first winning number in entries (if you want all the indexes, write about it in the comments).
Note: this is not the optimal code to solve your problem. It's just easier to undestand and debug if you're not familiar with Python's more advanced features.
You should probably consider not abbreviating your variables. entries_reader takes just a second more to write and 5 seconds less to understand then readere.
This is the variant that is faster, shorter and more memory efficient, but may be harder to understand:
with open('winningnumbers.csv', 'rb') as wn:
reader = csv.reader(wn)
winningnumbers = list(reader)
with open('Entries#x.csv', 'rb') as en:
readere = csv.reader(en)
for line_index, line in enumerate(readere):
if all((line[i] == winningnumbers[i] for i in xrange(len(line)))):
winning_number_index = line_index
break
else:
winning_number_index = -1
print(winning_number_index)
The features that might me unclear are probably enumerate(), any() and using else in for and not in if. Let's go through all of them one by one.
To understand this usage of enumerate, you'll need to understand that syntax:
a, b = [1, 2]
Variables a and b will be assigned according values from the list. In this case a will be 1 and b will be 2. Using this syntax we can do that:
for a, b in [[1, 2], [2, 3], ['spam', 'eggs']]:
# do something with a and b
in each iteration, a and b will be 1 and 2, 2 and 3, 'spam' and 'eggs' accordingly.
Let's assume we have a list a = ['spam', 'eggs', 'potatoes']. enumerate() just returns a "list" like that: [(1, 'spam'), (2, 'eggs'), (3, 'potatoes')]. So, when we use it like that,
for line_index, line in enumerate(readere):
# Do something with line_index and line
line_index will be 1, 2, 3, e.t.c.
any() function accepts a sequence (list, tuple, e.t.c.) and returns True if all the elements in it are equal to True.
Generator expression mylist = [line[i] == winningnumbers[i] for i in range(len(line))] returns a list and is similar to the following:
mylist = []
for i in range(len(line)):
mylist.append(line[i] == winningnumbers[i]) # a == b will return True if a is equal to b
So any will return True only in cases when all the numbers from entry match the winning numbers.
Code in else section of for is called only when for was not interrupted by break, so in our situation it's good for setting a default index to return.
Having duplicate numbers seems illogical but if you want to get the count of matched numbers for each row regardless of index then makes nums a set and sum the times a number from each row is in the set:
from itertools import islice, imap
import csv
with open("in.txt") as f,open("numbers.txt") as nums:
# make a set of all winning nums
nums = set(imap(str.rstrip, nums))
r = csv.reader(f)
# iterate over each row and sum how many matches we get
for row in r:
print("{} matched {}".format(row[0], sum(n in nums
for n in islice(row, 1, None))))
Which using your input will output:
a matched 0
b matched 1
c matched 2
d matched 1
e matched 0
f matched 2
g matched 0
h matched 1
i matched 1
j matched 2
presuming your file is comma separated and you have a number per line in your numbers file.
If you actually want to know which numbers if any are present then you need to iterate over the number and print each one that is in our set:
from itertools import islice, imap
import csv
with open("in.txt") as f, open("numbers.txt") as nums:
nums = set(imap(str.rstrip, nums))
r = csv.reader(f)
for row in r:
for n in islice(row, 1, None):
if n in nums:
print("{} is in row {}".format(n, row[0]))
print("")
But again, I am not sure having duplicate numbers makes sense.
To group the rows based on how many matches, you can use a dict using the sum as the key and appending the first column value:
from itertools import islice, imap
import csv
from collections import defaultdict
with open("in.txt") as f,open("numbers.txt") as nums:
# make a set of all winning nums
nums = set(imap(str.rstrip, nums))
r = csv.reader(f)
results = defaultdict(list)
# iterate over each row and sum how many matches we get
for row in r:
results[sum(n in nums for n in islice(row, 1, None))].append(row[0])
results:
defaultdict(<type 'list'>,
{0: ['a', 'e', 'g'], 1: ['b', 'd', 'h', 'i'],
2: ['c', 'f', 'j']})
The keys are numbers match, the values are the rows ids that matched the n numbers.
I have a large text string and I would like to create a dictionary with a key = a pair of words (have to go through all possible combinations) in the string and the value = frequency of a given pair of words. Thus, it is a 2D matrix and each matrix element is a number (a frequency of the pair from a column and a row crossing each other. The position of the words in the pair is irrelevant: e.g. if ridebike = 4 (a frequency) then bikeride = 4 as well
The end result is to populate the matrix and then select N number of top pairs.
I am new working with text strings and with Python in general and I got hopelessly lost (also way too many loops in my "code")
This is what I have (after deleting stopwords and punctuations):
textNP = 'stopped traffic bklyn bqe 278 wb manhattan brtillary stx29 wb cadman pla hope oufootball makes safe manhattan kansas tomorrow boomersooner beatwildcats theyhateuscuztheyaintus hatersgonnahate rt bringonthecats bring cats exclusive live footage oklahoma trying get manhattan http colktsoyzvvz rt jonfmorse bring cats exclusive live footage oklahoma trying get manhattan'
Some code (incomplete and wrong):
txtU = set(textNP)
lntxt = len(textNP)
lntxtS = len(txtU)
matrixNP = {}
for b1, i1 in txtU:
for b2, i2 in txtU:
if i1< i2:
bb1 = b1+b2
bb2 = b2+b1
freq = 0
for k in textNP:
for j in textNP:
if k < j:
kj = k+j
if kj == bb1 | kj == bb2:
freq +=1
matrixNP[i1][i2] = freq
matrixNP[i2][i1] = freq
elif i1 == i2: matrixNP[i1][i1] = 1
One of the issues that I am certain that having many loops is wrong. Also, I am not sure how to assign calculated keys (concatenation of words) to a dictionary (I think I got the values correctly)
The text string is not a complete product: it will be cleaned from numbers and few other things with various regexs
Your help will be very much appreciated!
Are you looking for all combinations of 2 words, if so you can use itertools.combinations and a collections.Counter to do what you want:
>>> from itertools import combinations
>>> from collections import Counter
>>> N = 5
>>> c = Counter(tuple(sorted(a)) for a in combinations(textNP.split(), 2))
>>> c.most_common(N)
[(('manhattan', 'rt'), 8),
(('exclusive', 'manhattan'), 8),
(('footage', 'manhattan'), 8),
(('manhattan', 'oklahoma'), 8),
(('bring', 'manhattan'), 8)]
Or are you looking for all pairs of consecutive words then you can create a pairwise function:
>>> from itertools import tee
>>> from collections import Counter
>>> def pairwise(iterable):
... a, b = tee(iterable)
... next(b, None)
... return zip(a, b) # itertools.izip() in python2
>>> N = 5
>>> c = Counter(tuple(sorted(a)) for a in pairwise(textNP.split()))
>>> c.most_common(N)
[(('get', 'manhattan'), 2),
(('footage', 'live'), 2),
(('get', 'trying'), 2),
(('bring', 'cats'), 2),
(('exclusive', 'live'), 2)]
Neither way do I see bike ride in the list.
I am new to python and trying to write my dictionary values to a file using Python 2.7. The values in my Dictionary D is a list with at least 2 items.
Dictionary has key as TERM_ID and
value has format [[DOC42, POS10, POS22], [DOC32, POS45]].
It means the TERM_ID (key) lies in DOC42 at POS10, POS22 positions and it also lies in DOC32 at POS45
So I have to write to a new file in the format: a new line for each TERM_ID
TERM_ID (tab) DOC42:POS10 (tab) 0:POS22 (tab) DOC32:POS45
Following code will help you understand what exactly am trying to do.
for key,valuelist in D.items():
#first value in each list is an ID
docID = valuelist[0][0]
for lst in valuelist:
file.write('\t' + lst[0] + ':' + lst[1])
lst.pop(0)
lst.pop(0)
for n in range(len(lst)):
file,write('\t0:' + lst[0])
lst.pop(0)
The output I get is :
TERM_ID (tab) DOC42:POS10 (tab) 0:POS22
DOC32:POS45
I tried using the new line tag as well as commas to continue file writing on the same line at no of places, but it did not work. I fail to understand how the file write really works.
Any kind of inputs will be helpful. Thanks!
#Falko I could not find a way to attach the text file hence here is my sample data-
879\t3\t1
162\t3\t1
405\t4\t1455
409\t5\t1
13\t6\t15
417\t6\t13
422\t57\t1
436\t4\t1
141\t8\t1
142\t4\t145
170\t8\t1
11\t4\t1
184\t4\t1
186\t8\t14
My sample running code is -
with open('sampledata.txt','r') as sample,open('result.txt','w') as file:
d = {}
#term= ''
#docIndexLines = docIndex.readlines()
#form a d with format [[doc a, pos 1, pos 2], [doc b, poa 3, pos 8]]
for l in sample:
tID = -1
someLst = l.split('\\t')
#if len(someLst) >= 2:
tID = someLst[1]
someLst.pop(1)
#if term not in d:
if not d.has_key(tID):
d[tID] = [someLst]
else:
d[tID].append(someLst)
#read the dionary to generate result file
docID = 0
for key,valuelist in d.items():
file.write(str(key))
for lst in valuelist:
file.write('\t' + lst[0] + ':' + lst[1])
lst.pop(0)
lst.pop(0)
for n in range(len(lst)):
file.write('\t0:' + lst[0])
lst.pop(0)
My Output:
57 422:1
3 879:1
162:1
5 409:1
4 405:1455
436:1
142:145
11:1
184:1
6 13:15
417:13
8 141:1
170:1
186:14
Expected output:
57 422:1
3 879:1 162:1
5 409:1
4 405:1455 436:1 142:145 11:1 184:1
6 13:15 417:13
8 141:1 170:1 186:14
You probably don't get the result you're expecting because you didn't strip the newline characters \n while reading the input data. Try replacing
someLst = l.split('\\t')
with
someLst = l.strip().split('\\t')
To enforce the mentioned line breaks in your output file, add a
file.write('\n')
at the very end of your second outer for loop:
for key,valuelist in d.items():
// ...
file.write('\n')
Bottom line: write never adds a line break. If you do see one in your output file, it's in your data.