Need assistance in analyzing data (mean, median, mode, etc) from csv file - python

I am having difficulty with finding the mean, median, mode, counting occurrences of a value within a csv file.
This section of the file is a column of letters 'M' or 'F'
This specific excerpt of code displays a problem I am facing:
I am not sure why the counting variables are not being incremented.
Any assistance would be greatly appreciated
# Collect the third field (index 2) of every CSV row, then count 'M' and 'F'.
# NOTE(review): relies on `import csv` appearing elsewhere; the file handle is never closed.
citations2 = open('Non Traffic Citations.csv')
data2 = csv.reader(citations2)
gender = []
for row in data2:
gender.append(row[2])
# Drop the first collected value (presumably the header row - confirm against the file).
del gender [0]
male_count = 0
female_count = 0
for item in gender:
# print(item) - shows that the list has values within it
if 'M' == item:
# NOTE: `= + 1` is a plain assignment of the value +1 (unary plus), so the
# counter is rebound to 1 on every match; `+= 1` is the increment form.
male_count = + 1
if 'F' == item:
# NOTE: same issue here: this rebinds the counter to 1 instead of adding to it.
female_count = + 1
print(male_count)
print(female_count)

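If you are trying to increment the gender counts, the assignment in your loop is wrong: `male_count = + 1` sets the counter to +1 on every match instead of adding to it. Use the augmented assignment `+= 1`: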
# Tally the 'M' / 'F' entries held in `gender` (the list of column values).
male_count = 0
female_count = 0
for item in gender:
    if item == 'F':
        female_count += 1  # augmented assignment adds to the running total
    elif item == 'M':
        male_count += 1
# Report the totals once, after the whole list has been scanned.
print(male_count)
print(female_count)

You can use pandas:
import pandas as pd
df = pd.read_csv('Non Traffic Citations.csv')
# describe() only returns the summary table, so print it to see it when this
# runs as a script. include='all' also summarises non-numeric columns
# (count / unique / top / freq), which is what an 'M'/'F' column needs.
print(df.describe(include='all'))

Related

Unable to count an attribute in a list of objects

Disclaimer: New to python
I am trying to print a count of an attribute across a list of objects, but when I run the code the count comes back as zero. The list of objects prints fine; however, counting how many students come from each country is proving difficult.
Below is the txt file i am reading from, the class i have set up and the code. Any help would be greatly appreciated. I have attached a screenshot of the output at the bottom.
(I have had to space out the data from the text file)
B123, Jones, Barry, 24, Wales
B134, Kerry, Jane, 21, Scotland
B456, Smith, Percy, 19, England
B788, Roberts, Mary, 20, England
B543, Brown, Sinead, 22, Scotland
B777, Wilson, Rachel, 24, Wales
B321, Taylor, Peter, 20, England
B448, Anderson, Jill, 18, England
B999, Moore, Misty, 20, Wales
B278, Jackson, Bob, 23, Scotland
class Student:
    """One student record: ID, surname, forename, age and country."""

    def __init__(self, student_id, surname, forename, age, country):
        # Values are stored exactly as given; nothing is converted or trimmed.
        self.student_id, self.surname, self.forename = student_id, surname, forename
        self.age, self.country = age, country

    def printStudentDetails(self):
        """Print every field on its own line, each prefixed with its label."""
        labelled_fields = (
            ("StudentID: ", self.student_id),
            ("Surname: ", self.surname),
            ("Forename: ", self.forename),
            ("Age: ", self.age),
            ("Country: ", self.country),
        )
        for label, value in labelled_fields:
            print(label, value)
from Student import *
# Build one Student object per line of the text file.
students_list = []
students_text = open("studentsText.txt", "r")
for line in students_text:
# NOTE(review): `line` is split as read, without trimming, so the last piece
# (passed on as `country`) presumably still ends with the line's "\n" - confirm.
split_line = line.split(", ")
# The pieces are unpacked positionally into Student(...).
students = Student(*split_line)
students_list.append(students)
students_text.close()
# Print every student's details, then the total and the per-country counts.
def print_students1():
english_count = 0
scotland_count = 0
wales_count = 0
for studentObj in students_list:
studentObj.printStudentDetails()
# Exact string comparisons: any extra whitespace in `country` makes all
# three branches miss, leaving every counter at zero.
if studentObj.country == "England":
english_count += 1
elif studentObj.country == "Scotland":
scotland_count += 1
elif studentObj.country == "Wales":
wales_count += 1
print("The amount of students is ", len(students_list))
print("English Students: ", english_count)
print("Scottish Students: ", scotland_count)
print("Welsh Students: ", wales_count)
Output for print(studentObj)
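It looks like whitespace characters could be causing the if statements to always return false: each line read from the file still ends with a newline, so the last field is e.g. "Wales\n" rather than "Wales". Try using the .strip() function: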
# Trim once so stray whitespace (such as a trailing newline) cannot defeat
# the comparisons below.
country = studentObj.country.strip()
if country == "England":
    english_count += 1
elif country == "Scotland":
    scotland_count += 1
elif country == "Wales":
    wales_count += 1
The following could be helpful. "blankpaper.txt" is a file with the text you provided pasted in. The program uses a counts dictionary to store the number of observations by country. The program reads the file, line-by-line. For each line, the count for the corresponding country is incremented in counts.
The code is designed so that if you need to make use of information in more than one column (presently we only need information from the last column) then it would be easy to modify. To illustrate this, snippet 2 illustrates how to also compute the min and max ages from the file (while also counting the number of observations by country).
I hope this helps. If there are any questions and/or if there is any way that you think I can help please let me know!
Snippet 1 (counts observations by country)
import csv
def count_countries(rows, counts):
    """Add one to *counts* for the country named in the last field of each row.

    Args:
        rows: iterable of rows, each a list of strings (e.g. a ``csv.reader``).
        counts: dict mapping country name to its running count; updated in
            place and also returned.

    The last field is stripped of surrounding whitespace and must then equal
    a key of *counts* exactly; empty rows and unknown countries are ignored.
    """
    for row in rows:
        if not row:
            continue  # blank line in the file
        country = row[-1].strip()
        if country in counts:
            counts[country] += 1
    return counts


if __name__ == "__main__":
    # dictionary to store number of observations by country
    counts = {"Wales": 0, "England": 0, "Scotland": 0}
    with open("blankpaper.txt", newline='') as f:
        # csv.reader yields each line of f as a list of strings
        count_countries(csv.reader(f, delimiter=','), counts)
    print(f"Counts: {counts}")
Output
Counts: {'Wales': 3, 'England': 4, 'Scotland': 3}
Snippet 2 (counts observations by country and computes max and min ages)
import csv
def summarize_rows(rows, counts):
    """Count rows per country and find the youngest and oldest ages.

    Args:
        rows: iterable of rows, each a list of strings whose last field is
            the country and whose second-to-last field is the age.
        counts: dict mapping country name to its running count; updated in
            place and also returned.

    Returns:
        ``(counts, min_age, max_age)``; both ages are ``None`` when no row
        supplied an age.

    Raises:
        ValueError: if an age field is not an integer.

    Rows with fewer than two fields (e.g. blank lines) are skipped.
    """
    ages = []
    for row in rows:
        if len(row) < 2:
            continue
        country = row[-1].strip()
        if country in counts:
            counts[country] += 1
        # int() tolerates the leading space left by the ", " separator
        ages.append(int(row[-2]))
    if not ages:
        return counts, None, None
    return counts, min(ages), max(ages)


if __name__ == "__main__":
    # dictionary to store number of observations by country
    counts = {"Wales": 0, "England": 0, "Scotland": 0}
    with open("blankpaper.txt", newline='') as f:
        # csv.reader yields each line of f as a list of strings
        counts, min_age, max_age = summarize_rows(csv.reader(f, delimiter=','), counts)
    print(f"\nMax age from file: {max_age}")
    print(f"Min age from file: {min_age}")
    print(f"Counts: {counts}")
Output
Max age from file: 24
Min age from file: 18
Counts: {'Wales': 3, 'England': 4, 'Scotland': 3}

Assign a value to a sub group if condition is met

I would like to create column and assign a number to each team that won and lost in a given 'Rally' (0 for a Loss, 1 for a Win). The last row of each rally will display who won in the 'Points' column.
The image shows how the data is formatted and the desired result is in the 'Outcome' column:
My current code is;
# Returns 1 or 0 depending on whether x.TeamAB matches the letter tested for
# in x['Points']; returns None implicitly when neither 'A' nor 'B' is found.
def winLoss(x):
# NOTE(review): `in` applied to a pandas Series presumably tests the index
# labels, not the stored values - confirm this condition does what is intended.
if 'A' in x['Points']:
if x.TeamAB == 'A':
return 1
else:
return 0
elif 'B' in x['Points']:
if x.TeamAB == 'B':
return 1
else:
return 0
# NOTE(review): through groupby(...).apply, winLoss presumably receives the whole
# sub-frame for one rally rather than a single row, and the trailing .any()
# reduces the result to a single value before it is assigned - confirm.
df['Outcome'] = df.groupby('Rally').apply(winLoss).any()
Grab the winners for each rally by grouping and taking the last row of Points for each group, then use multiindex to loc filter an assign the Outcome:
# The winner of each rally is the last character of that rally's final
# (non-null) Points entry; transform('last') broadcasts it back onto every
# row of the rally, so df keeps its original index and column order.
rally_winner = df.groupby('Rally')['Points'].transform('last').str.slice(-1)
# 1 where the row's team is the rally winner, 0 otherwise.
df['Outcome'] = (df['TeamAB'] == rally_winner).astype(int)

Calculate percentages in Python (0% to100%)

In this code I have data containing many combinations of the letters 'a', 'b', 'c' and 'd', and I am trying to find out how often each combination occurs (example of the data: abdc, abcc, abcd, abbb, aaaa, abdc, ...).
After that I want the frequency of each letter combination as a percentage from 0% to 100%, including the combinations that occur zero times.
Example Input:
letters: ['abc','aaa','abb','acc','aac','abc','bbb','ccc','ddd','abc','adc','acd','acd','aac','aad','bba','bab','abb','abc','abd'...]
I get df from this: ( tab_files is the file where get my data)
for i, tab_file in enumerate(tab_files):
    # DataFrame.from_csv is no longer available in pandas; read_csv with
    # index_col=0 and parse_dates=True reproduces its defaults.
    # NOTE: df is rebound on every pass, so only the last file's data is
    # left once the loop finishes.
    df = pd.read_csv(tab_file, sep='\t', index_col=0, parse_dates=True)
Here is my try:
#letter_l = all combinations of letters (abcd) together
# Every 3-letter string over "abcd" is generated (4 * 4 * 4 = 64 entries).
nt_l = "abcd"
letter_l = []
for i1 in nt_l:
for i2 in nt_l:
for i3 in nt_l:
letter = i1+i2+i3
letter_l.append(letter)
#print(letter_l)
#calculates the amount of each letter combination and shows the percentage
# x collects one ratio per combination; number_per_combination maps each
# combination to its raw count.
x = []
number_per_combination = {}
for b in letter_l:
counter = 0
number_per_combination[b] = 0
# Full pass over df.letter for every combination, counting exact matches.
for c2 in df.letter:
if c2 == b:
counter +=1
number_per_combination[b] += 1
# amount of each letter combination divided through the whole amount
# NOTE: the stored value is a fraction of the total, not multiplied by 100.
# NOTE(review): under Python 2, `/` between two ints truncates, which would
# store 0 here - confirm which interpreter is in use.
x.append(counter/(len(df.letter)))
but I get strange percentages as answer... I don't understand why. Can somebody help me?
Output I want: number_per combination
'abc': 20% (40)
'aaa': 10% (20)
'ccd': 0% (0)
'ddd': 3% (6)...
So what you're trying to do is a histogram? Here's a simple way to do it:
input_list = ['a', 'a', 'b', 'b', 'b', 'c']


def histogram(my_list):
    """Return a dict mapping each item of *my_list* to its number of occurrences."""
    tally = {}
    for entry in my_list:
        # .get() supplies 0 the first time an entry is seen
        seen_so_far = tally.get(entry, 0)
        tally[entry] = seen_so_far + 1
    return tally


counts_text = str(histogram(input_list))
print(counts_text)
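The .get() method returns the value for the given key from the dictionary. If the key isn't there, it returns the value provided in the second argument instead (here 0); .get() itself does not insert anything — the key is added by the assignment `result[item] = ...`.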
import re
import itertools
data = "aaa, abc, aab"
# A fixed separator needs only str.split, not a regular expression.
words = data.split(', ')
total_count = len(words)

# Start every 3-letter combination of a-d at zero so combinations that never
# occur are still reported as 0%.
words_count = {"".join(combo): 0 for combo in itertools.product("abcd", repeat=3)}
for word in words:
    # .get() keeps this safe for words outside the pre-filled combinations
    words_count[word] = words_count.get(word, 0) + 1

for word, occurrences in words_count.items():
    # 100.0 forces float arithmetic, so the result is right on Python 2 as well
    p = 100.0 * occurrences / total_count
    print("%s: %.3f%%\t(%d)" % (word, p, occurrences))

Assigning salespersons to different cities program

I was working on a problem of assigning 8 salespersons to 8 different cities, which are represented in below format.
The column represents Sales person and row represent cities.
1) The condition for assigning are :
1) Only one city per person
2) Once a city is assigned a salesperson , the rows and columns and diagonal cities cannot be assigned to another person
I am not able to recreate an example from memory , sorry for that , but the representation of cities and salesperson is correct.
I thought to avoid rows or columns for similar salesperson , I could use permutations from python which will give me distinct set of cities without overlapping and from there on I could check for diagonal values.
Here is my attempt.
import collections
import itertools
def display(l):
    """Print every element of *l* on its own line."""
    # The original also built a comma string and a running count that were
    # never used; only the printing had any effect.
    for item in l:
        print(item)
# Walks every ordering of the numbers 1-8 and shows them in small batches.
# `cities` and `sales` are defined but not referenced again in this snippet.
cities = [1,2,3,4,5,6,7,8]
sales = [1,2,3,4,5,6,7,8]
print_list = []
count = 0
# Each permutation i is a tuple of 8 distinct values.
for i in itertools.permutations([1,2,3,4,5,6,7,8],8):
print_list.append(i)
if count == 2:
display(print_list)
#print print_list
#print '\n'
# NOTE(review): pops 8 times, but print_list appears to hold at most a few
# permutations at this point, so this presumably raises IndexError - confirm
# once the original indentation is restored.
for j in range(8):
print_list.pop()
count = 0
count = count + 1
I am stuck on how to check if a salesperson is in diagonal position to another Salesperson, If someone can extend my approach that would be great , would like any other explanation , would be helpful, I would like python as I am practising in it.

Counting data within ranges in csv

I have some data which I need to break down into manageable chunks. With the following data I need to count the number of times x occurs in column 11 with column 7 being a 1 and how many times the number x occurs in column 11. I need to put them into the first line of a csv. After that I need to count the same thing but with column 11 being the following brackets:
0
">0 but <0.05"
">0.05 but <0.10"
">0.1 but <0.15... all the way up to 1.00"
All of these would ideally be appended to the same new.csv i.e. not the main data csv
Some example raw data that fits the above description (please note a lot of the brackets will contain no data. In which case they would need to return 0,0:
01/01/2002,Data,class1,4,11yo+,4,1,George Smith,0,0,x
01/01/2002,Data,class1,4,11yo+,4,2,Ted James,0,0,x
01/01/2002,Data,class1,4,11yo+,4,3,Emma Lilly,0,0,x
01/01/2002,Data,class1,4,11yo+,4,5,George Smith,0,0,x
02/01/2002,Data,class2,4,10yo+,6,4,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,2,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,5,George Smith,1,2,0.5
02/01/2002,Data,class2,4,10yo+,6,3,Tom Phillips,0,0,x
02/01/2002,Data,class2,4,10yo+,6,1,Emma Lilly,0,1,0
02/01/2002,Data,class2,4,10yo+,6,6,George Smith,1,2,0.5
03/01/2002,Data,class3,4,10yo+,6,6,Ted James,0,1,0
03/01/2002,Data,class3,4,10yo+,6,3,Tom Phillips,0,3,0
03/01/2002,Data,class3,4,10yo+,6,2,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,4,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,1,George Smith,1,4,0.25
03/01/2002,Data,class3,4,10yo+,6,5,Tom Phillips,0,3,0
04/01/2002,Data,class4,2,10yo+,5,3,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,1,Ted James,0,2,0
04/01/2002,Data,class4,2,10yo+,5,2,George Smith,2,7,0.285714286
04/01/2002,Data,class4,2,10yo+,5,4,Emma Lilly,1,2,0.5
04/01/2002,Data,class4,2,10yo+,5,5,Tom Phillips,0,5,0
05/01/2002,Data,class5,4,11yo+,4,1,George Smith,2,8,0.25
05/01/2002,Data,class5,4,11yo+,4,2,Ted James,1,3,0.333333333
05/01/2002,Data,class5,4,11yo+,4,3,Emma Lilly,1,4,0.25
05/01/2002,Data,class5,4,11yo+,4,5,George Smith,2,8,0.25
06/01/2002,Data,class6,4,10yo+,6,4,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,2,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,5,George Smith,3,10,0.3
06/01/2002,Data,class6,4,10yo+,6,3,Tom Phillips,0,6,0
06/01/2002,Data,class6,4,10yo+,6,1,Emma Lilly,1,5,0.2
06/01/2002,Data,class6,4,10yo+,6,6,George Smith,3,10,0.3
07/01/2002,Data,class7,4,10yo+,6,6,Ted James,1,4,0.25
07/01/2002,Data,class7,4,10yo+,6,3,Tom Phillips,0,9,0
07/01/2002,Data,class7,4,10yo+,6,2,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,4,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,1,George Smith,3,12,0.25
07/01/2002,Data,class7,4,10yo+,6,5,Tom Phillips,0,9,0
08/01/2002,Data,class8,2,10yo+,5,3,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,1,Ted James,1,5,0.2
08/01/2002,Data,class8,2,10yo+,5,2,George Smith,4,15,0.266666667
08/01/2002,Data,class8,2,10yo+,5,4,Emma Lilly,2,6,0.333333333
08/01/2002,Data,class8,2,10yo+,5,5,Tom Phillips,0,11,0
09/01/2002,Data,class9,4,11yo+,4,1,George Smith,4,16,0.25
09/01/2002,Data,class9,4,11yo+,4,2,Ted James,2,6,0.333333333
09/01/2002,Data,class9,4,11yo+,4,3,Emma Lilly,2,8,0.25
09/01/2002,Data,class9,4,11yo+,4,5,George Smith,4,16,0.25
10/01/2002,Data,class10,4,10yo+,6,4,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,2,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,5,George Smith,5,18,0.277777778
10/01/2002,Data,class10,4,10yo+,6,3,Tom Phillips,0,12,0
10/01/2002,Data,class10,4,10yo+,6,1,Emma Lilly,2,9,0.222222222
10/01/2002,Data,class10,4,10yo+,6,6,George Smith,5,18,0.277777778
11/01/2002,Data,class11,4,10yo+,6,6,Ted James,2,7,0.285714286
11/01/2002,Data,class11,4,10yo+,6,3,Tom Phillips,0,15,0
11/01/2002,Data,class11,4,10yo+,6,2,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,4,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,1,George Smith,5,20,0.25
11/01/2002,Data,class11,4,10yo+,6,5,Tom Phillips,0,15,0
12/01/2002,Data,class12,2,10yo+,5,3,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,1,Ted James,2,8,0.25
12/01/2002,Data,class12,2,10yo+,5,2,George Smith,6,23,0.260869565
12/01/2002,Data,class12,2,10yo+,5,4,Emma Lilly,3,10,0.3
12/01/2002,Data,class12,2,10yo+,5,5,Tom Phillips,0,17,0
13/01/2002,Data,class13,4,11yo+,4,1,George Smith,6,24,0.25
13/01/2002,Data,class13,4,11yo+,4,2,Ted James,3,9,0.333333333
13/01/2002,Data,class13,4,11yo+,4,3,Emma Lilly,3,12,0.25
13/01/2002,Data,class13,4,11yo+,4,5,George Smith,6,24,0.25
14/01/2002,Data,class14,4,10yo+,6,4,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,2,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,5,George Smith,7,26,0.269230769
14/01/2002,Data,class14,4,10yo+,6,3,Tom Phillips,0,18,0
14/01/2002,Data,class14,4,10yo+,6,1,Emma Lilly,3,13,0.230769231
14/01/2002,Data,class14,4,10yo+,6,6,George Smith,7,26,0.269230769
15/01/2002,Data,class15,4,10yo+,6,6,Ted James,3,10,0.3
If anybody can help me achieve this I will truly grateful. If this requires more detail please ask.
One last note: the main data csv in question has 800k rows.
EDIT
Currently the output file appears as follows using the code supplied by #user650654:
data1,data2
If at all possible I would like the code changed slightly to output two more things. Hopefully these are not too difficult to do. Proposed changes to the output file (commas separate the fields of each new row):
A title field labelling the row (e.g. "x" or "0:0.05"), the calculated average of the values within each bracket (e.g. "0.02469"), data1, data2
So in reality it would probably look like this:
x,n/a,data1,data2
0:0.05,0.02469,data1,data2
0.05:0.1,0.5469,data1,data2
....
....
Column1 = Row label (The data ranges that are being counted in the original question i.e. from 0 to 0.05
Column2 = Calculated average of values that fell within a particular range. I.e. If the
Note that data1 & data2 are the two values the question initially asked for.
Column1
Many thanks AEA
Here is a solution for adding the two new fields:
import csv
import numpy
def count(infile='data.csv', outfile='new.csv'):
    """Summarise column 11 of *infile* into *outfile*, one CSV row per bracket.

    Every output row is ``label, mean, col7_is_1_count, total_count``:

    * ``x`` - rows whose column 11 is the literal ``x`` (mean is ``n/a``);
    * ``0`` - rows whose column 11 is numerically zero ("0", "0.0", ...);
    * twenty 0.05-wide brackets from 0 to 1. As in ``numpy.histogram``, each
      bracket is half-open ``[low, high)`` except the last, which also
      contains 1.0. Non-zero values outside 0..1 are not counted.

    Args:
        infile: path of the CSV to read; column 7 is ``line[6]`` and
            column 11 is ``line[10]``.
        outfile: path of the CSV to write (overwritten).

    Raises:
        ValueError: if column 11 is neither ``x`` nor a number.

    Rows with fewer than 11 fields (e.g. blank lines) are skipped.
    """
    # Integer-derived edges are the correctly rounded doubles for 0.05, 0.10,
    # ..., so a value written as "0.15" compares equal to its edge instead of
    # drifting by a float step as accumulated arange steps can.
    edges = numpy.arange(21) / 20.0
    total_x = col7one_x = 0
    total_zeros = col7one_zeros = 0
    all_vals = []
    col7one_vals = []

    # newline='' is what the csv module expects for files it reads or writes
    with open(infile, 'r', newline='') as fobj:
        for line in csv.reader(fobj):
            if len(line) < 11:
                continue
            col7_is_one = line[6] == '1'
            if line[10] == 'x':
                total_x += 1
                if col7_is_one:
                    col7one_x += 1
                continue
            val = float(line[10])
            if val == 0:
                total_zeros += 1
                if col7_is_one:
                    col7one_zeros += 1
                continue
            all_vals.append(val)
            if col7_is_one:
                col7one_vals.append(val)

    all_array = numpy.asarray(all_vals, dtype=float)
    col7one_array = numpy.asarray(col7one_vals, dtype=float)
    hist_all, _ = numpy.histogram(all_array, bins=edges)
    hist_col7one, _ = numpy.histogram(col7one_array, bins=edges)
    # Per-bracket sums come from the same histogram call (weighted by the
    # values), so means always use exactly the rows counted in hist_all -
    # including a value equal to the top edge, which digitize would exclude.
    bin_sums, _ = numpy.histogram(all_array, bins=edges, weights=all_array)

    # %g gives compact labels: "0:0.05", "0.05:0.1", ..., "0.95:1"
    bin_ranges = ['%g:%g' % (low, high) for low, high in zip(edges[:-1], edges[1:])]
    bin_means = [float(total) / int(n) if n else 'n/a'
                 for total, n in zip(bin_sums, hist_all)]

    with open(outfile, 'w', newline='') as fobj:
        writer = csv.writer(fobj)
        writer.writerow(['x', 'n/a', col7one_x, total_x])
        writer.writerow(['0', 0 if total_zeros else 'n/a', col7one_zeros, total_zeros])
        for label, mean, ones, total in zip(bin_ranges, bin_means, hist_col7one, hist_all):
            writer.writerow([label, mean, int(ones), int(total)])


if __name__ == '__main__':
    count()
This might work:
import numpy as np
import pandas as pd
# Names used as column labels; the csv itself has no header row.
column_names = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6',
                'col7', 'col8', 'col9', 'col10', 'col11']
df = pd.read_csv('data.csv', header=None, names=column_names)

# Trick: park the non-numeric 'x' rows at -0.08 so they land in their own
# bracket (-0.1, -0.05] and all of col11 can be treated as numbers.
# (.loc replaces the .ix indexer, which is no longer available in pandas.)
df.loc[df.col11 == 'x', 'col11'] = -0.08

# Bracket edges -0.1, -0.05, 0, 0.05, ..., 1.0. Deriving them from integers
# gives correctly rounded edges instead of accumulated float steps.
# (-0.1, -0.05] holds the 'x' rows and (-0.05, 0] holds the zeros.
bins = np.arange(-2, 21) / 20.0
labels = np.array(['%g:%g' % (x, y) for x, y in zip(bins[:-1], bins[1:])])
labels[0] = 'x'  # first bracket is the special 'x' bracket
labels[1] = '0'  # second bracket captures the '0' values

df['col11'] = df['col11'].astype(float)  # numeric so it can be binned and averaged
# Integer bracket number for every row; mapped to its label after grouping.
# 'bin' is kept as a plain column only: having it as both index and column
# makes groupby('bin') ambiguous.
df['bin'] = pd.cut(df['col11'], bins=bins, labels=False)


def count_ones(x):
    """aggregate function to count values that equal 1"""
    return np.sum(x == 1)


# Group by bracket number and aggregate the columns of interest.
dfg = df.groupby('bin').agg({'col11': ['mean'], 'col7': [count_ones, len]})
# Bracket numbers become floats when any row fell outside the brackets, so
# cast before using them to look up the labels.
dfg.index = labels[dfg.index.astype(int)]
# The mean of the 'x' rows is meaningless; use an object column so the text
# marker can sit alongside the numeric means.
dfg[('col11', 'mean')] = dfg[('col11', 'mean')].astype(object)
if 'x' in dfg.index:
    dfg.loc['x', ('col11', 'mean')] = 'N/A'
print(dfg)
dfg.to_csv('new.csv')
which gave me
col7 col11
count_ones len mean
x 1 7 N/A
0 2 21 0
0.15:0.2 2 2 0.2
0.2:0.25 9 22 0.2478632
0.25:0.3 0 13 0.2840755
0.3:0.35 0 5 0.3333333
0.45:0.5 0 4 0.5
This solution uses numpy.histogram. See below.
import csv
import numpy
def count(infile='data.csv', outfile='new.csv'):
    """Count column-11 values of *infile* per bracket and write them to *outfile*.

    Every output row is ``col7_is_1_count, total_count``. The first row covers
    rows whose column 11 is the literal ``x``, the second covers rows whose
    column 11 is numerically zero ("0", "0.0", ...), followed by one row for
    each of twenty 0.05-wide brackets from 0 to 1. As in ``numpy.histogram``,
    each bracket is half-open ``[low, high)`` except the last, which also
    contains 1.0. Non-zero values outside 0..1 are not counted.

    Args:
        infile: path of the CSV to read; column 7 is ``line[6]`` and
            column 11 is ``line[10]``.
        outfile: path of the CSV to write (overwritten).

    Raises:
        ValueError: if column 11 is neither ``x`` nor a number.

    Rows with fewer than 11 fields (e.g. blank lines) are skipped.
    """
    total_x = col7one_x = 0
    total_zeros = col7one_zeros = 0
    all_vals = []
    col7one_vals = []

    # newline='' is what the csv module expects for files it reads or writes
    with open(infile, 'r', newline='') as fobj:
        for line in csv.reader(fobj):
            if len(line) < 11:
                continue
            col7_is_one = line[6] == '1'
            if line[10] == 'x':
                total_x += 1
                if col7_is_one:
                    col7one_x += 1
                continue
            val = float(line[10])
            if val == 0:
                total_zeros += 1
                if col7_is_one:
                    col7one_zeros += 1
                continue
            all_vals.append(val)
            if col7_is_one:
                col7one_vals.append(val)

    # Integer-derived edges are the correctly rounded doubles for 0.05, 0.10,
    # ..., avoiding the drift of accumulated float steps.
    edges = numpy.arange(21) / 20.0
    hist_all, _ = numpy.histogram(numpy.asarray(all_vals, dtype=float), bins=edges)
    hist_col7one, _ = numpy.histogram(numpy.asarray(col7one_vals, dtype=float), bins=edges)

    with open(outfile, 'w', newline='') as fobj:
        writer = csv.writer(fobj)
        writer.writerow([col7one_x, total_x])
        writer.writerow([col7one_zeros, total_zeros])
        for ones, total in zip(hist_col7one, hist_all):
            writer.writerow([int(ones), int(total)])


if __name__ == '__main__':
    count()

Categories

Resources