Mean of an Array Created from a CSV in Python

I am trying to find the mean of an array created from data in a CSV file using Python. The array only includes values from the CSV column that fall within a given range, so it does not contain every value in the column. My current code that creates the array is shown below. Several arrays are created, but I only need the mean of the array called "T07s". I consistently get the error "cannot perform reduce with flexible type" when calling np.mean(T07s).
import csv
import numpy as np

class dataPoint:
    def __init__(self, V, T07, T19, T27, Time):
        self.V = V
        self.T07 = T07
        self.T19 = T19
        self.T27 = T27
        self.Time = Time

dataPoints = []
with open("data_final.csv") as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip the header row
    for row in reader:
        if 229 <= float(row[2]) <= 231:
            temp = dataPoint(row[1], row[12], row[24], row[32], row[0].split(" ")[1])
            dataPoints.append(temp)

T07s = np.array([x.T07 for x in dataPoints])
The data included in T07s is shown below:
for x in T07s:
    print(x)
37.2
539
435.6
717.4
587
757.9
861.8
1024.2
325
117.9
136.3
167.8
809
405.3
405.1
112.7
1317.1
1731.8
1080.2
1208.6
1212.6
1363.8
1715.3
2376.4
2563.9
2998.4
2934.7
2862.4
390.8
2332.2
2121
2237.6
2334.1
2082.2
1892.1
1888.8
1960.6
1329.1
1657.2
2042.4
1417.5
977.3
1442.8
561.2
500.3
413.3
324.1
693.7
750
865.7
434.2
635.2
815.7
171.4
829.3
815.3
774.8
1411.6
1685.1
1345.1
1193.2
1674.9
1636.4
1389.8
753.3
1102.8
908.3
1223.2
1199.4
1040.7
1040.9
824.7
620
795.7
810.4
378.8
643.2
441.8
682.8
417.8
515.6
2354.7
1938.8
1512.4
1933.5
1739.8
2281.9
1997.5
2833.4
182.8
202.4
217.3
234.2
741.9

A simpler solution is to use pandas:
import pandas as pd

data = pd.read_csv('data_final.csv')
# Parentheses are required: & binds more tightly than the comparisons.
data_filtered = data[(data.iloc[:, 2] >= 229) & (data.iloc[:, 2] <= 231)]
print(data_filtered['T07'].mean())
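For reference, the original numpy error happens because csv.reader yields strings, so T07s ends up with a string ("flexible") dtype that np.mean cannot reduce. Casting to float fixes it; a minimal sketch with a few stand-in values:
import numpy as np

# Stand-ins for the strings that csv.reader produced (hypothetical values).
T07s = np.array(['37.2', '539', '435.6'])
print(T07s.dtype)                 # <U5 -- a "flexible" string dtype
print(T07s.astype(float).mean())  # cast to float first, then reduce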

Related

How to reshape data in Python?

I have a data set as given below-
Timestamp = 22-05-2019 08:40 :Light = 64.00 :Temp_Soil = 20.5625 :Temp_Air = 23.1875 :Soil_Moisture_1 = 756 :Soil_Moisture_2 = 780 :Soil_Moisture_3 = 1002
Timestamp = 22-05-2019 08:42 :Light = 64.00 :Temp_Soil = 20.5625 :Temp_Air = 23.125 :Soil_Moisture_1 = 755 :Soil_Moisture_2 = 782 :Soil_Moisture_3 = 1002
And I want to Reshape(rearrange) the dataset to orient header columns like [Timestamp, Light, Temp_Soil, Temp_Air, Soil_Moisture_1, Soil_Moisture_2, Soil_Moisture_3] and their values as the row entry in Python.
One possible solution:
Instead of a "true" input file, I used a string:
inp="""Timestamp = 22-05-2019 08:40 :Light = 64.00 :TempSoil = 20.5625 :TempAir = 23.1875 :SoilMoist1 = 756 :SoilMoist2 = 780 :SoilMoist3 = 1002
Timestamp = 22-05-2019 08:42 :Light = 64.00 :TempSoil = 20.5625 :TempAir = 23.125 :SoilMoist1 = 755 :SoilMoist2 = 782 :SoilMoist3 = 1002"""
import re
import pandas as pd
from io import StringIO  # pd.compat.StringIO was removed in newer pandas

buf = StringIO(inp)
To avoid "folding" of output lines, I shortened field names.
Then let's create the result DataFrame and a list of "rows" to append to it.
For now - both of them are empty.
df = pd.DataFrame(columns=['Timestamp', 'Light', 'TempSoil', 'TempAir',
                           'SoilMoist1', 'SoilMoist2', 'SoilMoist3'])
src = []
Below is a loop processing input rows:
while True:
    line = buf.readline()
    if not line:                          # EOF
        break
    lst = re.split(r' :', line.rstrip())  # field list
    if len(lst) < 2:                      # skip empty source lines
        continue
    dct = {}                              # source "row" (dictionary)
    for elem in lst:                      # process fields
        k, v = re.split(r' = ', elem)
        dct[k] = v                        # add field : value to "row"
    src.append(dct)
And the last step is to append the rows from src to df (DataFrame.append was removed in pandas 2.0, so pd.concat is used here):
df = pd.concat([df, pd.DataFrame(src)], ignore_index=True, sort=False)
When you print(df), for my test data, you will get:
          Timestamp  Light  TempSoil  TempAir  SoilMoist1  SoilMoist2  SoilMoist3
0  22-05-2019 08:40  64.00   20.5625  23.1875         756         780        1002
1  22-05-2019 08:42  64.00   20.5625   23.125         755         782        1002
For now all columns are of string type, so you can change the required
columns to either float or int:
df.Light = pd.to_numeric(df.Light)
df.TempSoil = pd.to_numeric(df.TempSoil)
df.TempAir = pd.to_numeric(df.TempAir)
df.SoilMoist1 = pd.to_numeric(df.SoilMoist1)
df.SoilMoist2 = pd.to_numeric(df.SoilMoist2)
df.SoilMoist3 = pd.to_numeric(df.SoilMoist3)
Note that the to_numeric() function is clever enough to recognize the type to
convert to, so the first 3 columns changed their type to float64
and the next 3 to int64.
You can check it executing df.info().
One more possible conversion is to change Timestamp column
to DateTime type:
df.Timestamp = pd.to_datetime(df.Timestamp)
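For comparison, the whole parse can also be done in a few lines. This is just a sketch, assuming the same field layout as the test string above:
import re
import pandas as pd

inp = """Timestamp = 22-05-2019 08:40 :Light = 64.00 :TempSoil = 20.5625 :TempAir = 23.1875 :SoilMoist1 = 756 :SoilMoist2 = 780 :SoilMoist3 = 1002
Timestamp = 22-05-2019 08:42 :Light = 64.00 :TempSoil = 20.5625 :TempAir = 23.125 :SoilMoist1 = 755 :SoilMoist2 = 782 :SoilMoist3 = 1002"""

# Each line becomes a dict of field -> value, then a DataFrame row.
rows = [dict(re.split(r' = ', field) for field in re.split(r' :', line))
        for line in inp.splitlines() if line.strip()]
df = pd.DataFrame(rows)

# Convert the numeric columns and the timestamp in one go.
num_cols = ['Light', 'TempSoil', 'TempAir', 'SoilMoist1', 'SoilMoist2', 'SoilMoist3']
df[num_cols] = df[num_cols].apply(pd.to_numeric)
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d-%m-%Y %H:%M')
print(df)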

Displaying results from 3 different For loops into a table

I want to display all three lists side by side with the names associated with the values in a table format. I am manually doing it right now and it's taking a while for all 20 files I must do. Thank you for your help!
maxpreandpost = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]
for i in maxpreandpost:
    height = max(i.Z)
    print(height)
165.387
160.214
159.118
186.685
163.744
160.717
184.026
171.25099999999995
175.73
156.512
150.339
131.528
148.52100000000004
126.738
136.389
148.334
129.855
153.599
144.595
159.32299999999995
lenpreandpost = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]
for i in lenpreandpost:
    duration = len(i.Z)
    print(duration)
690
543
292
271
293
147
209
355
230
293
395
256
349
255
335
255
231
243
315
267
dis = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]
for i in dis:
    p1 = [max(i.X), max(i.Y)]
    p2 = [min(i.X), min(i.Y)]
    distance = math.sqrt(((p1[0]-p2[0])**2) + ((p1[1]-p2[1])**2))
    print(distance)
2219.0546989150585
2337.434842606099
1857.1832474809803
1450.0472277998394
1512.6539831504758
1058.5635689541748
1653.517987682021
1854.670452561212
1861.8190476064021
1775.672511965326
1872.275393720069
1814.9932559772114
1852.3299779009246
1875.2281201398403
1867.1599096301313
1708.250531327712
1793.8521780715407
1862.7949271803914
1872.843665022548
1800.2239125453254
Sure, append all values to output lists and then add them to a pandas dataframe:
import math
import pandas as pd

heightmax = []
maxpreandpost = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]
for i in maxpreandpost:
    height = max(i.Z)
    heightmax.append(height)

duration_pre_post = []
lenpreandpost = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]
for i in lenpreandpost:
    duration = len(i.Z)
    duration_pre_post.append(duration)

dis_p1_p2 = []
dis = [Pre1,Pre2,Pre3,Pre4,Pre5,Pre6,Pre7,Pre8,Pre9,Pre10,Post1,Post2,Post3,Post4,Post5,Post6,Post7,Post8,Post9,Post10]
for i in dis:
    p1 = [max(i.X), max(i.Y)]
    p2 = [min(i.X), min(i.Y)]
    distance = math.sqrt(((p1[0]-p2[0])**2) + ((p1[1]-p2[1])**2))
    dis_p1_p2.append(distance)

df = pd.DataFrame()  # initialize empty dataframe
# Store each list as a column in the df.
df['HeightMax'] = heightmax
df['DurationPrePost'] = duration_pre_post
df['DistanceP1P2'] = dis_p1_p2

# If you want to write this out to a tabular file:
df.to_csv('./Desktop/myDf.csv', sep='\t', index=False)
The output of this would be something like:
   HeightMax  DurationPrePost        DistanceP1P2
0    165.387              690  2219.0546989150585
1    160.214              543   2337.434842606099
2    159.118              292  1857.1832474809803
3    186.685              271  1450.0472277998394
4    163.744              293  1512.6539831504758
...  # extends to the end of the lists
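Since all three loops walk the same list, the rows could also be built in a single pass. A sketch with SimpleNamespace stand-ins for Pre1 ... Post10 (hypothetical data, assuming each object exposes X, Y and Z sequences):
import math
from types import SimpleNamespace

import pandas as pd

# Hypothetical stand-ins for Pre1 ... Post10.
datasets = [
    SimpleNamespace(X=[0.0, 300.5], Y=[0.0, 2200.1], Z=[120.3, 165.387]),
    SimpleNamespace(X=[5.0, 410.2], Y=[1.0, 2330.8], Z=[101.7, 160.214]),
]

# One row per dataset, computing all three statistics at once.
rows = [{
    'HeightMax': max(d.Z),
    'DurationPrePost': len(d.Z),
    'DistanceP1P2': math.hypot(max(d.X) - min(d.X), max(d.Y) - min(d.Y)),
} for d in datasets]

df = pd.DataFrame(rows)
print(df)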

Joining data from two tables with different number of columns

I am new to Python, and this is sample code I found online.
I have two big CSV files, one from the database and another with the company metadata. I would like to compare specific columns in both tables and generate a new CSV file that shows me where the missing records in the metadata are. Keep in mind that the two CSV files do not have the same number of columns, and I want to analyse specific columns in both.
These are the two csv files:
csv1 (copied from an Excel sheet):
start_time                  end_time                    aitechid   hh_village  grpdetails1/farmername  grpdetails1/farmermobile
2016-11-26T14:01:47.329+03  2016-11-26T14:29:05.042+03  AI00001    2447        KahsuGebru              919115604
2016-11-26T19:34:42.159+03  2016-11-26T20:39:27.430+03  936891238  2473        Moto Aleka              914370833
2016-11-26T12:13:23.094+03  2016-11-26T14:25:19.178+03  914127382  2390        Hagos                   914039654
2016-11-30T14:31:28.223+03  2016-11-30T14:56:33.144+03  920784222  384         Mohammed Ali            923456788
2016-11-30T14:22:38.631+03  2016-11-30T15:06:44.199+03  912320358  378         Habtamu Nuru            913856087
2016-11-29T03:41:36.532+03  2016-11-29T16:33:12.632+03  914763134  2301        Are gaining Giday       0
2016-11-29T16:21:05.012+03  2016-11-29T16:37:27.934+03  914763134  2290        G                       912345678
2016-11-30T17:23:34.145+03  2016-11-30T18:00:32.142+03  914763134  2291        Haile tesfu             0
2016-11-30T20:37:54.657+03  2016-11-30T20:56:16.472+03  914763134  2300        Negative Abay           933082495
2016-11-30T21:00:22.063+03  2016-11-30T21:18:44.478+03  914763134  2291        Niguel Amare            914270455
csv2 (copied from an Excel sheet):
farmermobile
941807851
946741296
9
920212218
915
939555303
961579437
919961811
100004123
972635273
918166831
961579437
I have tried this code but I am not getting the expected output:
import csv

def get_key(row):
    # NOTE: these column names come from a different dataset;
    # they must match the headers of your own CSV files.
    return row["!Sample_title"], row["!Sample_geo_accession"]

def load_csv(filename):
    """Put csv data into a dict that maps title/geo to the complete row."""
    d = {}
    with open(filename) as f:
        for row in csv.DictReader(f, delimiter=","):
            key = get_key(row)
            assert key not in d
            d[key] = row
    return d

def diffs(old, new):
    yield from added_or_removed("ADDED", new.keys() - old.keys(), new)
    yield from added_or_removed("REMOVED", old.keys() - new.keys(), old)
    yield from changed(old, new)

def compare_row(key, old, new):
    i = -1
    for i, line in enumerate(diffs(old, new)):
        if not i:
            print("/".join(key))
        print(" " + line)
    if i >= 0:
        print()

def added_or_removed(state, keys, d):
    items = sorted((key, d[key]) for key in keys)
    for key, value in items:
        yield "{:10}: {:30} | {:30}".format(state, key, value)

def changed(old, new):
    common_columns = old.keys() & new.keys()
    for column in sorted(common_columns):
        oldvalue = old[column]
        newvalue = new[column]
        if oldvalue != newvalue:
            yield "{:10}: {:30} | {:30} | {:30}".format(
                "CHANGED",
                column,
                oldvalue.ljust(30),
                newvalue.ljust(30))

if __name__ == "__main__":
    oldcsv = load_csv("/media/dmogaka/DATA/week4/combine201709.csv")
    newcsv = load_csv("/media/dmogaka/DATA/week4/combinedmissingrecords.csv")
    # title/geo pairs that occur in both files:
    common = oldcsv.keys() & newcsv.keys()
    for key in sorted(common):
        compare_row(key, oldcsv[key], newcsv[key])
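The column names in that code ("!Sample_title", "!Sample_geo_accession") come from a different dataset, which is why it finds nothing in these files. For the specific task described (finding farmermobile values missing from the metadata), a pandas sketch may be simpler; the file names here are hypothetical, and it assumes the columns shown in the samples above:
import pandas as pd

# Hypothetical file names; adjust to the real paths.
db = pd.read_csv('database_export.csv')     # has 'grpdetails1/farmermobile'
meta = pd.read_csv('company_metadata.csv')  # has 'farmermobile'

# Rows from the database whose mobile number is absent from the metadata.
missing = db[~db['grpdetails1/farmermobile'].isin(meta['farmermobile'])]
missing.to_csv('missing_in_metadata.csv', index=False)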

Calculating the area of an irregular shape from coordinates in a CSV file using Python

I am using Python to import a CSV file with coordinates in it, passing it to a list and using the contained data to calculate the area of each irregular figure. The data within the CSV file looks like this.
ID Name DE1 DN1 DE2 DN2 DE3 DN3
88637 Zack Fay -0.026841782 -0.071375637 0.160878583 -0.231788845 0.191811833 0.396593863
88687 Victory Greenfelder 0.219394372 -0.081932907 0.053054879 -0.048356016
88737 Lynnette Gorczany 0.043632299 0.118916157 0.005488698 -0.268612073
88787 Odelia Tremblay PhD 0.083147337 0.152277791 -0.039216388 0.469656787 -0.21725977 0.073797219
The code I am using is below; however, it raises an IndexError, as the first line doesn't have data in all columns. Is there a way to process the CSV file so it only uses the columns with data in them?
import csv
import math

def main():
    try:
        # ask user to open a file with coordinates for 4 points
        my_file = raw_input('Enter the Irregular Differences file name and location: ')
        file_list = []
        with open(my_file, 'r') as my_csv_file:
            reader = csv.reader(my_csv_file)
            print 'my_csv_file: ', (my_csv_file)
            reader.next()
            for row in reader:
                print row
                file_list.append(row)
        all = calculate(file_list)
        save_write_file(all)
    except IOError:
        print 'File reading error, Goodbye!'
    except IndexError:
        print 'Index Error, Check Data'

# now do your calculations on the 'data' in the file.
def calculate(my_file):
    return_list = []
    for row in my_file:
        de1 = float(row[2])
        dn1 = float(row[3])
        de2 = float(row[4])
        dn2 = float(row[5])
        de3 = float(row[6])
        dn3 = float(row[7])
        de4 = float(row[8])
        dn4 = float(row[9])
        de5 = float(row[10])
        dn5 = float(row[11])
        de6 = float(row[12])
        dn6 = float(row[13])
        de7 = float(row[14])
        dn7 = float(row[15])
        de8 = float(row[16])
        dn8 = float(row[17])
        de9 = float(row[18])
        dn9 = float(row[19])
        area_squared = abs((dn1 * de2) - (dn2 * de1)) + ((de3 * dn4) - (dn3 * de4)) + ((de5 * dn6) - (de6 * dn5)) + ((de7 * dn8) - (dn7 * de8)) + ((dn9 * de1) - (de9 * dn1))
        area = area_squared / 2
        row.append(area)
        return_list.append(row)
    return return_list

def save_write_file(all):
    with open('output_task4B.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["ID", "Name", "de1", "dn1", "de2", "dn2", "de3", "dn3", "de4", "dn4", "de5", "dn5", "de6", "dn6", "de7", "dn7", "de8", "dn8", "de9", "dn9", "Area"])
        writer.writerows(all)

if __name__ == '__main__':
    main()
Any suggestions?
Your problem appears to be in the calculate function.
You are trying to access various indexes of row without first confirming they exist. One naive approach might be to consider the values to be zero if they are not present, except that:
+ ((dn9 * de1) - (de9 * dn1))
is an attempt to wrap around, and this might invalidate your math since they would go to zero.
A better approach is probably to use a slice of the row, and use the sequence-iterating approach instead of trying to require a certain number of points. This lets your code fit the data.
coords = [float(v) for v in row[2:] if v]  # skip id and name; csv gives strings, so convert and drop empty cells
assert len(coords) % 2 == 0, "Coordinates must come in pairs!"
prev_de = coords[-2]
prev_dn = coords[-1]
area_squared = 0.0
for de, dn in zip(coords[::2], coords[1::2]):
    area_squared += (de * prev_dn) - (dn * prev_de)
    prev_de, prev_dn = de, dn
area = abs(area_squared) / 2
The next problem will be dealing with variable length output. I'd suggest putting the area before the coordinates. That way you know it's always column 3 (or whatever).
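Folding that approach into the question's calculate function might look like the sketch below; the area is written right after the ID and name so it is always in the same column, and empty trailing cells are dropped before the float conversion:
def calculate(rows):
    return_list = []
    for row in rows:
        coords = [float(v) for v in row[2:] if v]  # skip ID/name, drop empty cells
        assert len(coords) % 2 == 0, "Coordinates must come in pairs!"
        prev_de, prev_dn = coords[-2], coords[-1]  # start from the last vertex
        area_squared = 0.0
        for de, dn in zip(coords[::2], coords[1::2]):
            area_squared += (de * prev_dn) - (dn * prev_de)
            prev_de, prev_dn = de, dn
        area = abs(area_squared) / 2               # shoelace formula
        # area goes before the variable-length coordinates
        return_list.append([row[0], row[1], area] + coords)
    return return_list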

Doing operations on a large data set

I have to perform some analysis on a PSL record, which contains information on DNA sequence fragments. Basically, I have to find entries that are from the same read and in the same contig (both are values in the PSL entry). The problem is that the PSL records are large (10-30 MB text documents). I wrote a program that works on short records, and on the long records given enough time, but it took far longer than specified: I was told the program shouldn't take more than ~15 seconds, and mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys

class PSLreader:
    '''
    Class to provide reading of a file containing psl alignments
    formatted sequences:

    object instantiation:
        myPSLreader = PSLreader(<file name>)

    object attributes:
        fname: the initial file name

    methods:
        readPSL() : reads psl file, yielding those alignments that are within
                    the first or last 1000 nt
        readPSLpairs() : yields psl pairs that support a circular hypothesis

    Author: David Bernick
    Date: May 12, 2013
    '''

    def __init__(self, fname=''):
        '''constructor: saves attribute fname'''
        self.fname = fname

    def doOpen(self):
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readPSL(self):
        '''
        Using the filename given in init, yields each filtered psl record
        containing alignments that are within the terminal 1000 nt of
        the target. Incomplete psl records are discarded.
        If a filename was not provided, stdin is used.

        This method selects for alignments that could be part of a circle.

        Illumina pairs aligned to the top strand would have read1(+) and read2(-).
        For the bottom strand, read1(-) and read2(+).

        For potential circularity, these are the conditions that can support it:
            read1(+) near the 3' terminus
            read1(-) near the 5' terminus
            read2(-) near the 5' terminus
            read2(+) near the 3' terminus
        so...
            any read(+) near the 3', or
            any read(-) near the 5'
        '''
        nearEnd = 1000  # this constant determines "near the end"
        with self.doOpen() as fileH:
            for line in fileH:
                pslList = line.split()
                if len(pslList) < 17:
                    continue
                tSize = int(pslList[14])
                tStart = int(pslList[15])
                strand = str(pslList[8])
                if strand.startswith('+') and (tSize - tStart > nearEnd):
                    continue
                elif strand.startswith('-') and (tStart > nearEnd):
                    continue
                yield line

    def readPSLpairs(self):
        read1 = []
        read2 = []
        for psl in self.readPSL():
            parsed_psl = psl.split()
            strand = parsed_psl[9][-1]  # '1' or '2' at the end of the read name
            if strand == '1':
                read1.append(parsed_psl)
            elif strand == '2':
                read2.append(parsed_psl)
        output = {}
        for psl1 in read1:
            name1 = psl1[9][:-1]
            contig1 = psl1[13]
            for psl2 in read2:
                name2 = psl2[9][:-1]
                contig2 = psl2[13]
                if name1 == name2 and contig1 == contig2:
                    try:
                        output[contig1] += 1
                        break
                    except KeyError:
                        output[contig1] = 1
                        break
        print(output)

PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise(a):
    for leftItem in a[1]:
        for rightItem in a[2]:
            if leftItem[1] is rightItem[1]:
                print(a)

thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
              ['John', 'violin', 1], ['John', 'oboe', 2],
              ['Patrick', 'theremin', 1], ['Patrick', 'lute', 2]]

thisGroup = None
thisGroupList = [[], [], []]  # index 0 is unused; nums are 1 and 2
for name, instrument, num in thisStream:
    if name != thisGroup:
        doSomethingPairwise(thisGroupList)
        thisGroup = name
        thisGroupList = [[], [], []]
    thisGroupList[num].append([name, instrument, num])
doSomethingPairwise(thisGroupList)
But when I tried to implement it, my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow, but I don't see an alternative.
Edit: I figured it out; the data was presorted, which made my brute-force solution impractical and unnecessary.
I hope this helps you; the question would benefit from a good example input file.
import sys  # needed for the sys.stdin fallback below

# It's better to create a PSLRecord class
class PSLRecord:
    def __init__(self, line):
        pslList = line.split()
        properties = ("matches", "misMatches", "repMatches", "nCount",
                      "qNumInsert", "qBaseInsert", "tNumInsert",
                      "tBaseInsert", "strand", "qName", "qSize", "qStart",
                      "qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
                      "blockSizes", "qStarts", "tStarts")
        self.__dict__.update(dict(zip(properties, pslList)))

class PSLreader:
    def __init__(self, fname=''):
        self.fname = fname

    def doOpen(self):
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readPSL(self):
        with self.doOpen() as fileH:
            for line in fileH:
                pslrc = PSLRecord(line)
                yield pslrc

    # return a dictionary with all psl records grouped by qName and tName
    def readPSLpairs(self):
        dictpsl = {}
        for pslrc in self.readPSL():
            # OP requirement: strip the trailing '1' or '2' with pslrc.qName[:-1]
            key = (pslrc.qName[:-1], pslrc.tName)
            if key not in dictpsl:
                dictpsl[key] = []
            dictpsl[key].append(pslrc)
        return dictpsl

# The filter function is better kept separate and self-contained
def f_filter(pslrec, nearEnd=1000):
    if (pslrec.strand.startswith('+') and
            (int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
        return False
    if (pslrec.strand.startswith('-') and
            (int(pslrec.tStart) > nearEnd)):
        return False
    return True

PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
# read the dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()

from itertools import product
# product from itertools:
# (1) x (2, 3) = (1, 2), (1, 3)
output = {}
for key, v in dictpsl.items():
    name, contig = key
    # filtered alignments from read 1
    strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
                    pslrec.qName[-1] == '1']
    # filtered alignments from read 2
    strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
                  pslrec.qName[-1] == '2']
    for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
        # this loop makes far fewer comparisons, since records were grouped first
        if contig not in output:
            output[contig] = 0  # start at 0 so the first pair is counted once
        output[contig] += 1
Note: 10-30 MB isn't a large file, if you ask me.
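If you only need the per-contig counts, the grouping can be collapsed further with a set per (read name, contig) key; a sketch using hypothetical (qName, tName) tuples in place of full PSL records:
from collections import defaultdict

# Hypothetical (qName, tName) pairs standing in for filtered PSL records.
alignments = [
    ('readA/1', 'contig1'), ('readA/2', 'contig1'),
    ('readB/1', 'contig2'), ('readB/2', 'contig3'),
]

seen = defaultdict(set)  # (read name, contig) -> {'1', '2'}
for qname, tname in alignments:
    seen[(qname[:-1], tname)].add(qname[-1])

output = {}
for (name, contig), ends in seen.items():
    if ends == {'1', '2'}:  # both mates hit the same contig
        output[contig] = output.get(contig, 0) + 1
print(output)               # {'contig1': 1}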
