Time complexity of a huge list in python 2.7 - python

I've a list which has approximately 177071007 items.
and i'm trying to perform the following operations
a) get the first and last occurance of a unique item in the list.
b) the number of occurances.
def parse_data(file, op_file_test):
ins = csv.reader(open(file, 'rb'), delimiter = '\t')
pc = list()
rd = list()
deltas = list()
reoccurance = list()
try:
for row in ins:
pc.append(int(row[0]))
rd.append(int(row[1]))
except:
print row
pass
unique_pc = set(pc)
unique_pc = list(unique_pc)
print "closing file"
#takes a long time from here!
for a in range(0, len(unique_pc)):
index_first_occurance = pc.index(unique_pc[a])
index_last_occurance = len(pc) - 1 - pc[::-1].index(unique_pc[a])
delta_rd = rd[index_last_occurance] - rd[index_first_occurance]
deltas.append(int(delta_rd))
reoccurance.append(pc.count(unique_pc[a]))
print unique_pc[a] , delta_rd, reoccurance[a]
print "printing to file"
map_file = open(op_file_test,'a')
for a in range(0, len(unique_pc)):
print >>map_file, "%d, %d, %d" % (unique_pc[a], deltas[a], reoccurance)
map_file.close()
However the complexity is in the order of O(n).
Would there be a possibility to make the for loop 'run fast', by that i mean, do you think yielding would make it fast? or is there any other way? unfortunately, i don't have numpy

Try the following:
from collections import defaultdict
# Keep a dictionary of our rd and pc values, with the value as a list of the line numbers each occurs on
# e.g. {'10': [1, 45, 79]}
pc_elements = defaultdict(list)
rd_elements = defaultdict(list)
with open(file, 'rb') as f:
line_number = 0
csvin = csv.reader(f, delimiter='\t')
for row in csvin:
try:
pc_elements[int(row[0])].append(line_number)
rd_elements[int(row[1])].append(line_number)
line_number += 1
except ValueError:
print("Not a number")
print(row)
line_number += 1
continue
for pc, indexes in pc_elements.iteritems():
print("pc {0} appears {1} times. First on row {2}, last on row {3}".format(
pc,
len(indexes),
indexes[0],
indexes[-1]
))
This works by creating a dictionary, when reading the TSV with the pc value as the the key and a list of occurrences as the value. By the nature of a dict the key must be unique so we avoid the set and the list values are only being used to keep the rows that key occurs on.
Example:
pc_elements = {10: [4, 10, 18, 101], 8: [3, 12, 13]}
would output:
"pc 10 appears 4 times. First on row 4, last on row 101"
"pc 8 appears 3 times. First on row 3, last on row 13"

As you scan items from your input file, put the items into a collections.defaultdict(list) where the key is the item and the value is a list of occurence indices. It will take linear time to read the file and build up this data structure and constant time to get the first and last occurrence index of an item, and constant time to get the number of occurrences of an item.
Here's how it might work
mydict = collections.defaultdict(list)
for item, index in itemfilereader: # O(n)
mydict[item].append(index)
# first occurrence of item, O(1)
mydict[item][0]
# last occurrence of item, O(1)
mydict[item][-1]
# number of occurrences of item, O(1)
len(mydict[item])

Maybe it's worth chaning the data structure used. I'd use a dict that uses pc as key and the occurence as values.
lookup = dict{}
counter = 0
for line in ins:
values = lookup.setdefault(int(line[0]),[])
values.append(tuple(counter,int(line[1])))
counter += 1
for key, val in lookup.iteritems():
value_of_first_occurence = lookup[key][1][1]
value_of_last_occurence = lookup[key][-1][1]
first_occurence = lookup[key][1][0]
last_occurence = lookup[key][-1][0]
value = lookup[key][0]

Try replacing list by dicts, lookup in a dict is much faster than in a long list.
That could be something like this:
def parse_data(file, op_file_test):
ins = csv.reader(open(file, 'rb'), delimiter = '\t')
# Dict of pc -> [rd first occurence, rd last occurence, list of occurences]
occurences = {}
for i in range(0, len(ins)):
row = ins[i]
try:
pc = int(row[0])
rd = int(row[1])
except:
print row
continue
if pc not in occurences:
occurences[pc] = [rd, rd, i]
else:
occurences[pc][1] = rd
occurences[pc].append(i)
# (Remove the sorted is you don't need them sorted but need them faster)
for value in sorted(occurences.keys()):
print "value: %d, delta: %d, occurences: %s" % (
value, occurences[value][1] - occurences[value][0],
", ".join(occurences[value][2:])

Related

Splitting a special type of list data and saving data into two separate dataframe using condition in python

Want to seperate a list data into two parts based on condition. If the value is less than "H1000", we want in a first dataframe(Output for list 1) and if it is greater or equal to "H1000" we want in a second dataframe(Output for list2). First column starts the value with H followed by a four numbers.
Here is my python code:
with open(fn) as f:
text = f.read().strip()
print(text)
lines = [[(Path(fn.name), line_no + 1, col_no + 1, cell) for col_no, cell in enumerate(
re.split('\t', l.strip())) if cell != ''] for line_no, l in enumerate(re.split(r'[\r\n]+', text))]
print(lines)
if (lines[:][:][3] == "H1000"):
list1
list2
I am not able to write a python logic to divide the list data into two parts.
Attach python code & file here
So basically you want to check if the number after the H is greater or not than 1000 right? If I'm right then just do like this:
with open(fn) as f:
text = f.read().strip()
print(text)
lines = [[(Path(fn.name), line_no + 1, col_no + 1, cell) for col_no, cell in enumerate(
re.split('\t', l.strip())) if cell != ''] for line_no, l in enumerate(re.split(r'[\r\n]+', text))]
print(lines)
value = lines[:][:][3]
if value[1:].isdigit():
if (int(value[1:]) < 1000):
#list 1
else:
#list 2
we simply take the numerical part of the factor "hxxxx" with the slices, convert it into an integer and compare it with 1000
with open(fn) as f:
text = f.read().strip()
lines =text.split('\n')
list1=[]
list2=[]
for i in lines:
if int(i.split(' ')[0].replace("H",""))>=1000:
list2.append(i)
else:
list1.append(i)
print(list1)
print("***************************************")
print(list2)
I'm not sure exactly where the problem lies. Assuming you read the above text file line by line, you can simply make use of str.__le__ to check your condition, e.g.
lines = """
H0002 Version 3
H0003 Date_generated 5-Aug-81
H0004 Reporting_period_end_date 09-Jun-99
H0005 State WAA
H0999 Tene_no/Combined_rept_no E79/38975
H1001 Tene_holder Magnetic Resources NL
""".strip().split("\n")
# Or
# with open(fn) as f: lines = f.readlines()
list_1, list_2 = [], []
for line in lines:
if line[:6] <= "H1000":
list_1.append(line)
else:
list_2.append(line)
print(list_1, list_2, sep="\n")
# ['H0002 Version 3', 'H0003 Date_generated 5-Aug-81', 'H0004 Reporting_period_end_date 09-Jun-99', 'H0005 State WAA', 'H0999 Tene_no/Combined_rept_no E79/38975']
# ['H1001 Tene_holder Magnetic Resources NL']

Reading a text file to print frequency of letters in decreasing order - Python 3

I am doing python basic challenges this is one of them. What all I needed to do is to read through a file and print out the frequency of letters in decreasing order. I am able to do this but I wanted to enhance the program by also printing out the frequency percentage alongside with the letter - frequency - freq%. Something like this: o - 46 - 10.15%
This is what I did so far:
def exercise11():
import string
while True:
try:
fname = input('Enter the file name -> ')
fop = open(fname)
break
except:
print('This file does not exists. Please try again!')
continue
counts = {}
for line in fop:
line = line.translate(str.maketrans('', '', string.punctuation))
line = line.translate(str.maketrans('', '', string.whitespace))
line = line.translate(str.maketrans('', '', string.digits))
line = line.lower()
for ltr in line:
if ltr in counts:
counts[ltr] += 1
else:
counts[ltr] = 1
lst = []
countlst = []
freqlst = []
for ltrs, c in counts.items():
lst.append((c, ltrs))
countlst.append(c)
totalcount = sum(countlst)
for ec in countlst:
efreq = (ec/totalcount) * 100
freqlst.append(efreq)
freqlst.sort(reverse=True)
lst.sort(reverse=True)
for ltrs, c, in lst:
print(c, '-', ltrs)
exercise11()
As you can see I am able to calculate and sort the freq% on a different list but I am not able to include it in the tuple of the lst[] list alongside with the letter, freq. Is there any way to solve this problem?
Also if you have any other suggestions for my code. Please do mention.
Output Screen
Modification
Applying a simple modification as mentioned by #wwii I got the desired output. All I had to do is add one more parameter to the print statement while iterating the lst[] list. Previously I tried to make another list for the freq%, sort and then tried to insert it to the letters-count tuple in a list which didn't work out.
for ltrs, c, in lst:
print(c, '-', ltrs, '-', round(ltrs/totalcount*100, 2), '%')
Output Screen
Your count data is in a dictionary of {letter:count} pairs.
You can use the dictionary to calculate the total count like this:
total_count = sum(counts.values())
Then don't calculate the percentage till you are iterating over the counts...
for letter, count in counts.items():
print(f'{letter} - {count} - {100*count/total}') #Python v3.6+
#print('{} - {} - {}'.format(letter, count, 100*count/total) #Python version <3.6+
Or if you want to put it all in a list so you can sort it:
data = []
for letter, count in counts.items():
data.append((letter,count,100*count/total)
Using operator.itemgetter for the sort key function can help code readability.
import operator
letter = operator.itemgetter(0)
count = operator.itemgetter(1)
frequency = operator.itemgetter(2)
data.sort(key=letter)
data.sort(key=count)
data.sort(key=frequency)
Tuples are immutable which is probably the issue you are finding. The other issue is the simple form of the sort function; A more-advanced sort function would serve you well. See below:
The list-of-tuples format of lst, but because tuples are immutable whereas lists are mutable, opting to change lst to a list-of-lists is a valid approach. Then, since lst is a list-of-lists with each element consisting of 'letter,count,frequency%', the sort function with lambda can be used to sort by whichever index you'd like. The following is to be inserted after your for line in fop: loop.
lst = []
for ltrs, c in counts.items():
lst.append([ltrs,c])
totalcount = sum([x[1] for x in lst]) # sum all 'count' values in a list comprehension
for elem in lst:
elem.append((elem[1]/totalcount)*100) # now that each element in 'lst' is a mutable list, you can append the calculated frequency to the respective element in lst
lst.sort(reverse=True,key=lambda lst:lst[2]) # sort in-place in reverse order by index 2.
The items in freqlst,countlist, and lst are related to each other by their position. If any are sorted that relationship is lost.
zipping the lists together before sorting will maintain the relationship.
Will pick up from your list initialization lines.
lst = []
countlst = []
freqlst = []
for ltr, c in counts.items():
#change here, lst now only contains letters
lst.append(ltr)
countlst.append(c)
totalcount = sum(countlst)
for ec in countlst:
efreq = (ec/totalcount) * 100
freqlst.append(efreq)
#New stuff here: Note this only works in python 3+
zipped = zip(lst, countlst, freqlst)
zipped = sorted(zipped, key=lambda x: x[1])
for ltr, c, freq in zipped:
print("{} - {} - {}%".format(ltr, c, freq)) # love me the format method :)
Basically, zip combines lists together into a list of tuples. Then you can use a lambda function to sort those tuples (very common stack question)
I think I was able to achieve what you wanted by using lists instead of tuples. Tuples cannot be modified, but if you really want to know how click here
(I also added the possibility to quit the program)
Important: Never forget to comment your code
The code:
def exercise11():
import string
while True:
try:
fname = input('Enter the file name -> ')
print('Press 0 to quit the program') # give the User the option to quit the program easily
if fname == '0':
break
fop = open(fname)
break
except:
print('This file does not exists. Please try again!')
continue
counts = {}
for line in fop:
line = line.translate(str.maketrans('', '', string.punctuation))
line = line.translate(str.maketrans('', '', string.whitespace))
line = line.translate(str.maketrans('', '', string.digits))
line = line.lower()
for ltr in line:
if ltr in counts:
counts[ltr] += 1
else:
counts[ltr] = 1
lst = []
countlst = []
freqlst = []
for ltrs, c in counts.items():
# add a zero as a place holder &
# use square brakets so you can use a list that you can modify
lst.append([c, ltrs, 0])
countlst.append(c)
totalcount = sum(countlst)
for ec in countlst:
efreq = (ec/totalcount) * 100
freqlst.append(efreq)
freqlst.sort(reverse=True)
lst.sort(reverse=True)
# count the total of the letters
counter = 0
for ltrs in lst:
counter += ltrs[0]
# calculate the percentage for each letter
for letter in lst:
percentage = (letter[0] / counter) * 100
letter[2] += float(format(percentage, '.2f'))
for i in lst:
print('The letter {} is repeated {} times, which is {}% '.format(i[1], i[0], i[2]))
exercise11()
<?php
$fh = fopen("text.txt", 'r') or die("File does not exist");
$line = fgets($fh);
$words = count_chars($line, 1);
foreach ($words as $key=>$value)
{
echo "The character <b>' ".chr($key)." '</b> was found <b>$value</b> times. <br>";
}
?>

Frequencies in Python

def frequencies(data):
data.sort()
count = 0
previous = data[0]
print("data\tfrequency") # '\t' is the TAB character
for d in data:
if d == previous:
# same as the previous, so just increment the count
count += 1
else:
# we've found a new item so print out the old and reset the count
print(str(previous) + "\t" + str(count))
count = 1
previous = d
So I have this frequency code, but its leaving off the last number in my list every time.
It may have something to do with where I start previous or possibly where I reset previous to d at the end.
For the last group of elements, you never print them out, because you never find something different after it. You would need to repeat the printout thing after your loop.
But that is rather academic; in real world, you would be much more likely to use Counter:
from collections import Counter
counter = Counter(data)
for key in counter:
print("%s\t%d" % (key, counter[key]))
You can count items in a list/sequence using count. So your code can be simplified to look like this:
def frequencies(data):
unique_items = set(data)
for item in unique_items:
print('%s\t%s' % (item, data.count(item)))

Need help understanding why this value is staying as 1? Python CSV

So this block of code is supposed to open the csv file, get the values from column 1-3 (not 0). Once it has got the values for each row and their 3 columns, it is supposed to add these values up and divide by 3. I thought this code would work however the addition of the 3 columns in each row doesn't seem to be working. If anyone could tell me why and how i can fix this, that would be great, thank you. I'm pretty certain the problem lies at the for index, summedValue in enumerate (sums): Specifically, the "summedValue" value.
if order ==("average score"):
askclass = str(input("what class?"))
if askclass == ('1'):
with open("Class1.csv") as f:
columns = f.readline().strip().split(" ")
sums = [1] * len(columns)
for line in f:
# Skip empty lines
if not line.strip():
continue
values = line.split(" ")
for i in range(1,len(values)):
sums[i] += int(values[i])
for index, summedValues in enumerate (sums):
print (columns[index], 1.0 * (summedValues) / 3)
from statistics import mean
import csv
with open("Class1.csv") as f:
# create reader object
r = csv.reader(f)
# skip headers
headers = next(r)
# exract name from row and use statistics.mean to average from row[1..
# mapping scores to ints
avgs = ((row[0], mean(map(int, row[1:]))) for row in r)
# unpack name and average and print
for name, avg in avgs:
print(name,avg)
Unless you have written empty lines to your csv file there won't be any, not sure how the header fits into it but you can use it if necessary.
You can also unpack with the * syntax in python 3 which I think is a bit nicer:
avgs = ((name, mean(map(int, row))) for name, *row in r)
for name, avg in avgs:
print(name,avg)
To order just sort by the average using reverse=True to sort from highest to lowest:
from statistics import mean
import csv
from operator import itemgetter
with open("Class1.csv") as f:
r = csv.reader(f)
avgs = sorted(((name, mean(map(int, row))) for name, *row in r),key=itemgetter(1),reverse=True)
for name, avg in avgs:
print(name,avg)
Passing key=itemgetter(1) means we sort by the second subelement which is the average in each tuple.
using
1, 2, 3
4, 2, 3
4, 5, 3
1, 6, 3
1, 6, 6
6, 2, 3
as Class1.csv
and
askclass = str(input("what class?"))
if askclass == ('1'):
with open("Class1.csv") as f:
columns = f.readline().strip().split(",")
sums = [1] * len(columns)
for line in f:
# Skip empty lines
if not line.strip():
continue
values = line.split(",")
for i in range(1,len(values)):
sums[i] += int(values[i])
for index, summedValues in enumerate (sums):
print (columns[index], 1.0 * (summedValues) / 3)
I obtain the expected result:
what class?1
('1', 0.3333333333333333)
(' 2', 7.333333333333333)
(' 3', 6.333333333333333)
[update] Observations:
sums defined ad sums = [1] * len(columns) has length columns, but you ignore first column in you operations so value for sum[0] will always be 1, do not seems necessary.
for float division it is sufficient summedValues / 3.0 instead of 1.0 * (summedValues) / 3
Maybe this is what you want
for line in f:
# Skip empty lines
if not line.strip():
continue
values = line.split(" ")
for i in range(1,len(values)):
sums[i] += int(values[i])
for index, summedValues in enumerate (sums):
print (columns[index], 1.0 * (summedValues) / 3)

Converting string to float

I am trying to write a program that tallies the values in a file. For example, I am given a file with numbers like this
2222 (First line)
4444 (Second line)
1111 (Third line)
My program takes in the name of an input file (E.G. File.txt), and the column of numbers to tally. So for example, if my file.txt contains the number above and i need the sum of column 2, my function should be able to print out 7(2+4+1)
t1 = open(argv[1], "r")
number = argv[2]
k = 0
while True:
n = int(number)
t = t1.readline()
z = list(t)
if t == "":
break
k += float(z[n])
t1.close()
print k
This code works for the first column when I set it to 0, but it doesn't return a consistent result when I set it to 1 even though they should be the same answer.
Any thoughts?
A somewhat uglier implementation that demonstrates the cool-factor of zip:
def sum_col(filename, colnum):
with open(filename) as inf:
columns = zip(*[line.strip() for line in inf])
return sum([int(num) for num in list(columns)[colnum]])
zip(*iterable) flips from row-wise to columnwise, so:
iterable = ['aaa','bbb','ccc','ddd']
zip(*iterable) == ['abcd','abcd','abcd'] # kind of...
zip objects aren't subscriptable, so we need to cast as list before we subscript it (doing [colnum]). Alternatively we could do:
...
for _ in range(colnum-1):
next(columns) # skip the columns we don't need
return sum([int(num) for num in next(columns)])
Or just calculate all the sums and grab the sum that we need
...
col_sums = [sum(int(num) for num in column) for column in columns]
return col_sums[colnum]

Categories

Resources