Average data based on specific columns - python

I have a data file with many rows and 8 columns. I want to average column 8 of the rows that have the same values in columns 1, 2 and 5 - for example, my file can look like this:
564645 7371810 0 21642 1530 1 2 30.8007
564645 7371810 0 21642 8250 1 2 0.0103
564645 7371810 0 21643 1530 1 2 19.3619
I want to average the last column of the first and third rows, since their columns 1, 2 and 5 are identical;
I want the output to look like this:
564645 7371810 0 21642 1530 1 2 25.0813
564645 7371810 0 21642 8250 1 2 0.0103
my files (text files) are pretty big (~10000 lines), and the redundant rows (by the rule above) do not occur at regular intervals - so I want the code to find the redundant data and average them...
In response to larsks' comment - here are my 4 lines of code:
import os
import numpy as np
datadirectory = input('path to the data directory, ')
os.chdir(datadirectory)
## READ DATA FILE AND CREATE AN ARRAY
dataset = open(input('dataset_to_be_used, ')).readlines()
data = np.loadtxt(dataset)
## Sort the data based on common X, Y and frequency
datasort = np.lexsort((data[:,0], data[:,1], data[:,4]))
datasorted = data[datasort]
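A pure-numpy continuation of this (a sketch, assuming data is the (n, 8) array loaded above; the grouping logic is mine and untested on the real files) would collect column 8 per (column 1, column 2, column 5) key and average it:

groups = {}  # (col 1, col 2, col 5) -> [first full row, list of col-8 values]
for row in data:
    key = (row[0], row[1], row[4])
    if key not in groups:
        groups[key] = [row.copy(), []]
    groups[key][1].append(row[7])

averaged = []
for first_row, values in groups.values():
    first_row[7] = np.mean(values)  # replace column 8 with the group average
    averaged.append(first_row)
averaged = np.array(averaged)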

you can use pandas to do this quickly:
import pandas as pd
from StringIO import StringIO
data = StringIO("""564645 7371810 0 21642 1530 1 2 30.8007
564645 7371810 0 21642 8250 1 2 0.0103
564645 7371810 0 21643 1530 1 2 19.3619
""")
df = pd.read_csv(data, sep="\s+", header=None)
df.groupby(["X.1","X.2","X.5"])["X.8"].mean()
the output is:
X.1 X.2 X.5
564645 7371810 1530 25.0813
8250 0.0103
Name: X.8
if you don't need index, you can call:
df.groupby(["X.1","X.2","X.5"])["X.8"].mean().reset_index()
this will give the result as:
X.1 X.2 X.5 X.8
0 564645 7371810 1530 25.0813
1 564645 7371810 8250 0.0103
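Note that the X.1 ... X.8 labels come from the pandas version used here; with header=None, current pandas labels the columns 0-7 instead, so a positional version of the same groupby (a sketch on the same sample data) is:

import pandas as pd
from io import StringIO  # StringIO.StringIO on Python 2

data = StringIO("""564645 7371810 0 21642 1530 1 2 30.8007
564645 7371810 0 21642 8250 1 2 0.0103
564645 7371810 0 21643 1530 1 2 19.3619
""")
df = pd.read_csv(data, sep=r"\s+", header=None)
# columns 1, 2, 5 and 8 of the file are labelled 0, 1, 4 and 7 here
print(df.groupby([0, 1, 4])[7].mean().reset_index())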

OK, based on Hury's input I updated the code:
import os  # needed system utils
import numpy as np  # for array data processing
import pandas as pd  # import the pandas module
datadirectory = input('path to the data directory, ')
working = os.environ.get("WORKING_DIRECTORY", datadirectory)
os.chdir(working)
## READ DATA FILE AND CONVERT IT TO A STRING
dataset = open(input('dataset_to_be_used, ')).readlines()
data = ''.join(dataset)
df = pd.read_csv(data, sep="\s+", header=None)
sorted_data = df.groupby(["X.1", "X.2", "X.5"])["X.8"].mean().reset_index()
tuple_data = [tuple(x) for x in sorted_data.values]
datas = np.asarray(tuple_data)
This worked with the test data posted by hury, but when I use my own file, everything from the df = ... line onwards does not seem to work (I get output like:
Traceback (most recent call last):
File "/media/DATA/arxeia/Programming/MyPys/data_refine_average.py", line 31, in
df = pd.read_csv(data, sep="\s+", header=None)
File "/usr/lib64/python2.7/site-packages/pandas/io/parsers.py", line 187, in read_csv
return _read(TextParser, filepath_or_buffer, kwds)
File "/usr/lib64/python2.7/site-packages/pandas/io/parsers.py", line 141, in _read
f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding)
File "/usr/lib64/python2.7/site-packages/pandas/core/common.py", line 673, in _get_handle
f = open(path, mode)
IOError: [Errno 36] File name too long: '564645\t7371810\t0\t21642\t1530\t1\t2\t30.8007\r\n564645\t7371810\t0\t21642\t8250\t1\t2\t0.0103\r\n564645\t7371810\t0\t21642\t20370\t1\t2\t0.0042\r\n564645\t7371810\t0\t21642\t33030\t1\t2\t0.0026\r\n564645\t7371810\t0\t21642\t47970\t1\t2\t0.0018\r\n564645\t7371810\t0\t21642\t63090\t1\t2\t0.0013\r\n564645\t7371810\t0\t21642\t93090\t1\t2\t0.0009\r\n564645\t7371810\t0\t216..........
any ideas?
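For what it's worth, the traceback points at the cause: read_csv expects a file path (or a file-like object), but ''.join(dataset) hands it the file's contents, which it then tries to open as a path, hence the "File name too long" error. A sketch of the two usual fixes (untested on your files):

import pandas as pd
from StringIO import StringIO  # io.StringIO on Python 3

filename = input('dataset_to_be_used, ')

# option 1: let pandas open the file itself
df = pd.read_csv(filename, sep=r"\s+", header=None)

# option 2: keep the joined string, but wrap it in a file-like object
# data = ''.join(open(filename).readlines())
# df = pd.read_csv(StringIO(data), sep=r"\s+", header=None)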

It's not the most elegant of answers, and I have no idea how fast/efficient it is, but I believe it gets the job done based on the information you provided:
import numpy
data_file = "full_location_of_data_file"
data_dict = {}
for line in open(data_file):
    line = line.rstrip()
    columns = line.split()
    entry = [columns[0], columns[1], columns[4]]
    entry = "-".join(entry)
    try:  # valid if have already seen combination of 1,2,5
        data_dict[entry].append(float(columns[7]))
    except KeyError:  # KeyError the first time you see a combination of columns 1,2,5
        data_dict[entry] = [float(columns[7])]
for entry in data_dict:
    value = numpy.mean(data_dict[entry])
    output = entry.split("-")
    output.append(str(value))
    output = "\t".join(output)
    print output
I'm unclear whether you want/need columns 3, 6, or 7, so I omitted them. In particular, you do not make clear how you want to deal with the different values which may exist within them. If you can elaborate on what behaviour you want (i.e. default to a certain value, or keep the first occurrence), I'd suggest either filling in with default values or storing the first instance in a dictionary of dictionaries rather than a dictionary of lists.
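For example, a sketch of the "keep the first occurrence" variant (hypothetical, since I don't know which behaviour you actually want for columns 3, 6 and 7): store the whole first row per key and only accumulate column 8 separately.

import numpy

data_file = "full_location_of_data_file"
first_rows = {}  # key -> first full row seen for that combination of columns 1, 2, 5
values = {}      # key -> list of column-8 values

for line in open(data_file):
    columns = line.rstrip().split()
    key = "-".join([columns[0], columns[1], columns[4]])
    if key not in first_rows:
        first_rows[key] = columns
        values[key] = []
    values[key].append(float(columns[7]))

for key, first in first_rows.items():
    first[7] = str(numpy.mean(values[key]))  # overwrite column 8 with the group average
    print "\t".join(first)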

import os  # needed system utils
import numpy as np  # for array data processing
datadirectory = '/media/DATA/arxeia/Dimitris/Testing/12_11'
working = os.environ.get("WORKING_DIRECTORY", datadirectory)
os.chdir(working)
## HERE I WAS TRYING TO READ THE FILE, AND THEN USE THE NAME OF THE STRING IN THE FOLLOWING LINE - THAT RESULTED IN THE SAME ERROR DESCRIBED BELOW (ERROR # 42 (I think) - too large name)
data_dict = {}  # Create empty dictionary
for line in open('/media/DATA/arxeia/Dimitris/Testing/12_11/1a.dat'):  ## above error resolved when used this
    line = line.rstrip()
    columns = line.split()
    entry = [columns[0], columns[1], columns[4]]
    entry = "-".join(entry)
    try:  # valid if have already seen combination of 1,2,5
        data_dict[entry].append(float(columns[7]))
    except KeyError:  # KeyError the first time you see a combination of columns 1,2,5
        data_dict[entry] = [float(columns[7])]
for entry in data_dict:
    value = np.mean(data_dict[entry])
    output = entry.split("-")
    output.append(str(value))
    output = "\t".join(output)
    print output
My other problem now is getting the output in string format (or any format) - then I believe I can get to the save part and manipulate the final format:
np.savetxt('sorted_data.dat', sorted, fmt='%s', delimiter='\t') #Save the data
I still have to figure out how to add the other columns - I am working on that too.
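In case it helps, one way to go from the printed strings to something np.savetxt accepts (a sketch built on the data_dict loop above, untested on your files) is to collect each output row in a list first:

rows = []
for entry in data_dict:
    output = entry.split("-")                       # columns 1, 2 and 5
    output.append(str(np.mean(data_dict[entry])))   # averaged column 8
    rows.append(output)

np.savetxt('sorted_data.dat', rows, fmt='%s', delimiter='\t')  # save the data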

Related

Python & Pandas: appending data to new column

With Python and Pandas, I'm writing a script that passes text data from a csv through the pylanguagetool library to calculate the number of grammatical errors in a text. The script successfully runs, but appends the data to the end of the csv instead of to a new column.
The structure of the csv is:
The working code is:
import pandas as pd
from pylanguagetool import api
df = pd.read_csv("Streamlit\stack.csv")
text_data = df["text"].fillna('')
length1 = len(text_data)
for i, x in enumerate(range(length1)):
    # this is the pylanguagetool operation
    errors = api.check(text_data, api_url='https://languagetool.org/api/v2/', lang='en-US')
    result = str(errors)
    # this pulls the error count "message" from the pylanguagetool json
    error_count = result.count("message")
    output_df = pd.DataFrame({"error_count": [error_count]})
    output_df.to_csv("Streamlit\stack.csv", mode="a", header=(i == 0), index=False)
The output is:
Expected output:
What changes are necessary to append the output like this?
Instead of using a loop, you might consider lambda which would accomplish what you want in one line:
df["error_count"] = df["text"].fillna("").apply(lambda x: len(api.check(x, api_url='https://languagetool.org/api/v2/', lang='en-US')["matches"]))
>>> df
user_id ... error_count
0 10 ... 2
1 11 ... 0
2 12 ... 0
3 13 ... 0
4 14 ... 0
5 15 ... 2
Edit:
You can write the above to a .csv file with:
df.to_csv("Streamlit\stack.csv", index=False)
You don't want to use mode="a" as that opens the file in append mode whereas you want (the default) write mode.
My strategy would be to keep the error counts in a list, then create a separate column in the original DataFrame, and finally write that DataFrame to csv:
text_data = df["text"].fillna('')
length1 = len(text_data)
error_count_lst = []
for i in range(length1):
    # check one row's text at a time, not the whole Series
    errors = api.check(text_data[i], api_url='https://languagetool.org/api/v2/', lang='en-US')
    result = str(errors)
    error_count = result.count("message")
    error_count_lst.append(error_count)
df['error_count'] = error_count_lst
df.to_csv('file.csv', index=False)

Reading from a .dat file Dataframe in Python

I have a .dat file which looks something like the below....
#| step | Channel| Mode | Duration|Freq.| Amplitude | Phase|
0 1 AWG Pi/2 100 2 1
1 1 SIN^2 100 1 1
2 1 SIN^2 200 0.5 1
3 1 REC 50 100 1 1
100 0 REC Pi/2 150 1 1
I created a DataFrame and wanted to extract data from it, but I get an error:
TypeError: expected str, bytes or os.PathLike object, not DataFrame
My code is below:
import pandas as pd
import numpy as np
path = "updated.dat"
datContent = [i.strip().split() for i in open(path).readlines()]
#print(datContent)
column_names = datContent.pop(0)
print(column_names)
df = pd.DataFrame(datContent)
print(df)
extract_column = df.iloc[:,2]
with open(df, 'r') as openfile:
    for line in openfile:
        for column_search in line:
            column_search = df.iloc[:, 2]
            if "REC" in column_search:
                print("Rec found")
Any suggestions would be appreciated.
Since your post does not have a clear question, I have to guess based on your code. I am assuming that what you want is to find all rows in the DataFrame where the column Mode contains the value REC.
Based on that, I prepared a small, self-contained example that works on your data.
In your situation, the only line that you need is the last one. Assuming that your DataFrame is created and filled correctly, your code below print(df) can be replaced by this single line.
I would really recommend you reading the official documentation about indexing and selecting data from DataFrames. https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html
import pandas as pd
from io import StringIO
data = StringIO("""
no;step;Channel;Mode;Duration;Freq.;Amplitude;Phase
;0;1;AWG;Pi/2;100;2;1
;1;1;SIN^2;;100;1;1
;2;1;SIN^2;;200;0.5;1
;3;1;REC;50;100;1;1
;100;0;REC;Pi/2;150;1;1
""")
df = pd.read_csv(data, sep=";")
df.loc[df.loc[:, 'Mode'] == "REC", :]
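If all you wanted was the "Rec found" message from your original loop, the same comparison can drive it directly (a sketch using the df built above):

mask = df["Mode"] == "REC"   # True for every row whose Mode column is REC
if mask.any():
    print("Rec found")
print(df.loc[mask, :])       # the matching rows themselves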

Exporting max values of different csv files in to one

I have 3 datasets which contain the flow in m3/s per location. Dataset 1 is a 5-year ARI flood, Dataset 2 is a 20-year ARI flood and Dataset 3 is a 50-year ARI flood.
Per location I found the maximum discharge (5, 20 & 50 year).
Code:
for key in Data_5_ARI_RunID_Flow_New.keys():
    m = key
    y5F_RunID = Data_5_ARI_RunID_Flow_New.loc[:, m]
    y20F_RunID = Data_20_ARI_RunID_Flow_New.loc[:, m]
    y50F_RunID = Data_50_ARI_RunID_Flow_New.loc[:, m]
    max_y5F = max(y5F_RunID)
    max_y20F = max(y20F_RunID)
    max_y50F = max(y50F_RunID)
    Max_DataID = m, max_y5F, max_y20F, max_y50F
    print(Max_DataID)
The output is like this:
('G60_18', 44.0514, 47.625, 56.1275)
('Area5_11', 1028.4065, 1191.5946, 1475.9685)
('Area5_12', 1017.8286, 1139.2628, 1424.4304)
('Area5_13', 994.5626, 1220.0084, 1501.1483)
('Area5_14', 995.9636, 1191.8066, 1517.4541)
Now I want to export this result to a csv file, but I don't know how. I used this line of code, but it didn't work:
Max_DataID.to_csv(r'C:\Users\Max_DataID.csv', sep=',', index = False)
Use the file name myexample.csv, with the specific path where you want the file created.
Please check that Max_DataID is an iterable value. Since, going by your output, the values are tuples, I use list() to convert each tuple into a list, which is a supported value for writerow in csv.
import csv
file = open('myexample.csv', 'wb')
filewriter = csv.writer(file, delimiter=',')
for data in Max_DataID:
    filewriter.writerow(list(data))
You can do the following.
df.to_csv(file_name, sep='\t')
Also, if you want to split it into chunks, like 10,000 rows, or whatever, you can do this.
import pandas as pd
for i, chunk in enumerate(pd.read_csv('C:/your_path_here/main.csv', chunksize=10000)):
    chunk.to_csv('chunk{}.csv'.format(i))
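A pandas-only sketch of the same export (assuming the three Data_*_ARI_RunID_Flow_New DataFrames from the question; the output column names are placeholders): collect the tuples in a list, build a DataFrame, and write it once.

import pandas as pd

rows = []
for key in Data_5_ARI_RunID_Flow_New.keys():
    rows.append((key,
                 Data_5_ARI_RunID_Flow_New.loc[:, key].max(),
                 Data_20_ARI_RunID_Flow_New.loc[:, key].max(),
                 Data_50_ARI_RunID_Flow_New.loc[:, key].max()))

result = pd.DataFrame(rows, columns=['location', 'max_5y', 'max_20y', 'max_50y'])
result.to_csv(r'C:\Users\Max_DataID.csv', index=False)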

Improve python code in terms of speed

I have a very big file (1.5 billion lines) in the following form:
1 67108547 67109226 gene1$transcript1 0 + 1 0
1 67108547 67109226 gene1$transcript1 0 + 2 1
1 67108547 67109226 gene1$transcript1 0 + 3 3
1 67108547 67109226 gene1$transcript1 0 + 4 4
.
.
.
1 33547109 33557650 gene2$transcript1 0 + 239 2
1 33547109 33557650 gene2$transcript1 0 + 240 0
.
.
.
1 69109226 69109999 gene1$transcript1 0 + 351 1
1 69109226 69109999 gene1$transcript1 0 + 352 0
What I want to do is to reorganize/sort this file based on the identifier in column 4. The file consists of blocks. If you concatenate columns 4, 1, 2 and 3 you create the unique identifier for each block. This is the key for the dictionary all_exons, and the value is a numpy array containing all the values of column 8. Then I have a second dictionary, unique_identifiers, that has as key the attributes from column 4 and as value a list of the corresponding block identifiers. As output I write a file in the following form:
>gene1
0
1
3
4
1
0
>gene2
2
0
I already wrote some code (see below) that does this, but my implementation is very slow. It takes around 18 hours to run.
import os
import sys
import time
from contextlib import contextmanager
import pandas as pd
import numpy as np
def parse_blocks(bedtools_file):
    unique_identifiers = {}  # Dictionary with key: gene, value: set of exon identifiers
    all_exons = {}  # Dictionary containing all exons
    # Parse file and ...
    with open(bedtools_file) as fp:
        for line in fp:
            sp_line = line.strip().split("\t")
            current_id = sp_line[3].split("$")[0]
            identifier = "$".join([sp_line[3], sp_line[0], sp_line[1], sp_line[2]])
            if identifier in all_exons:
                item = float(sp_line[7])
                all_exons[identifier] = np.append(all_exons[identifier], item)
            else:
                all_exons[identifier] = np.array([sp_line[7]], float)
            if current_id in unique_identifiers:
                unique_identifiers[current_id].add(identifier)
            else:
                unique_identifiers[current_id] = set([identifier])
    return unique_identifiers, all_exons

identifiers, introns = parse_blocks(options.bed)
w = open(options.out, 'w')
for gene in sorted(list(identifiers)):
    w.write(">" + str(gene) + "\n")
    for intron in sorted(list(identifiers[gene])):
        for base in introns[intron]:
            w.write(str(base) + "\n")
w.close()
How can I improve the above code in order to run faster?
You also import pandas, therefore, I provide a pandas solution which requires basically only two lines of code.
However, I do not know how it performs on large data sets and whether that is faster than your approach (but I am pretty sure it is).
In the example below, the data you provide is stored in table.txt. I then use groupby to get all the values in your 8th column, store them in a list for the respective identifier in your column 4 (note that my indices start at 0) and convert this data structure into a dictionary which can then be printed easily.
import pandas as pd
df=pd.read_csv("table.txt", header=None, sep = r"\s+") # replace the separator by e.g. '/t'
op = dict(df.groupby(3)[7].apply(lambda x: x.tolist()))
So in this case op looks like this:
{'gene1$transcript1': [0, 1, 3, 4, 1, 0], 'gene2$transcript1': [2, 0]}
Now you could print the output like this and pipeline it in a certain file:
for k, v in op.iteritems():
    print k.split('$')[0]
    for val in v:
        print val
This gives you the desired output:
gene1
0
1
3
4
1
0
gene2
2
0
Maybe you can give it a try and let me know how it compares to your solution!?
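If you want that in a file rather than piped from stdout, a small sketch (same op dictionary as above; output.txt is a placeholder name, and the leading ">" matches the format in your question) is:

with open("output.txt", "w") as out:
    for k, v in sorted(op.iteritems()):  # op.items() on Python 3
        out.write(">" + k.split('$')[0] + "\n")
        for val in v:
            out.write(str(val) + "\n")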
Edit2:
In the comments you mentioned that you would like to print the genes in the correct order. You can do this as follows:
# add some fake genes to op from above
op['gene0$stuff'] = [7, 9]
op['gene4$stuff'] = [5, 9]
# print using 'sorted'
for k, v in sorted(op.iteritems()):
    print k.split('$')[0]
    for val in v:
        print val
which gives you:
gene0
7
9
gene1
0
1
3
4
1
0
gene2
2
0
gene4
5
9
EDIT1:
I am not sure whether duplicates are intended but you could easily get rid of them by doing the following:
op2 = dict(df.groupby(3)[7].apply(lambda x: set(x)))
Now op2 would look like this:
{'gene1$transcript1': {0, 1, 3, 4}, 'gene2$transcript1': {0, 2}}
You print the output as before:
for k, v in op2.iteritems():
    print k.split('$')[0]
    for val in v:
        print val
which gives you
gene1
0
1
3
4
gene2
0
2
I'll try to simplify your problem; my solution works like this:
First, scan over the big file. For every distinct current_id, open a temporary file and append the value of column 8 to that file.
After the full scan, concatenate all chunks into a result file.
Here's the code:
# -*- coding: utf-8 -*-
import os
import tempfile
import subprocess
class ChunkBoss(object):
    """Boss for file chunks"""

    def __init__(self):
        self.opened_files = {}

    def write_chunk(self, current_id, value):
        if current_id not in self.opened_files:
            self.opened_files[current_id] = open(tempfile.mktemp(), 'wb')
            self.opened_files[current_id].write('>%s\n' % current_id)
        self.opened_files[current_id].write('%s\n' % value)

    def cat_result(self, filename):
        """Catenate chunks to one big file"""
        # Sort the chunks
        chunk_file_list = []
        for current_id in sorted(self.opened_files.keys()):
            chunk_file_list.append(self.opened_files[current_id].name)
        # Flush chunks
        [chunk.flush() for chunk in self.opened_files.values()]
        # By calling cat command
        with open(filename, 'wb') as fp:
            subprocess.call(['cat', ] + chunk_file_list, stdout=fp, stderr=fp)

    def clean_up(self):
        [os.unlink(chunk.name) for chunk in self.opened_files.values()]


def main():
    boss = ChunkBoss()
    with open('bigfile.data') as fp:
        for line in fp:
            data = line.strip().split()
            current_id = data[3].split("$")[0]
            value = data[7]
            # Write value to temp chunk
            boss.write_chunk(current_id, value)
    boss.cat_result('result.txt')
    boss.clean_up()


if __name__ == '__main__':
    main()
I tested the performance of my script, with bigfile.data containing about 150k lines. It took about 0.5s to finish on my laptop. Maybe you can give it a try.
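Independent of the temp-file approach, one easily measured bottleneck in the original parse_blocks is np.append, which copies the whole array on every call; a sketch of the same loop accumulating plain Python lists instead (same names as in the question) looks like this:

def parse_blocks(bedtools_file):
    unique_identifiers = {}  # gene -> set of block identifiers
    all_exons = {}           # block identifier -> plain list of column-8 values
    with open(bedtools_file) as fp:
        for line in fp:
            sp_line = line.strip().split("\t")
            current_id = sp_line[3].split("$")[0]
            identifier = "$".join([sp_line[3], sp_line[0], sp_line[1], sp_line[2]])
            # list.append is O(1); np.append copies the whole array every time
            all_exons.setdefault(identifier, []).append(float(sp_line[7]))
            unique_identifiers.setdefault(current_id, set()).add(identifier)
    return unique_identifiers, all_exons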

Convert large csv to hdf5

I have a 100M line csv file (actually many separate csv files) totaling 84GB. I need to convert it to a HDF5 file with a single float dataset. I used h5py in testing without any problems, but now I can't do the final dataset without running out of memory.
How can I write to HDF5 without having to store the whole dataset in memory? I'm expecting actual code here, because it should be quite simple.
I was just looking into pytables, but it doesn't look like the array class (which corresponds to a HDF5 dataset) can be written to iteratively. Similarly, pandas has read_csv and to_hdf methods in its io_tools, but I can't load the whole dataset at one time so that won't work. Perhaps you can help me solve the problem correctly with other tools in pytables or pandas.
Use append=True in the call to to_hdf:
import numpy as np
import pandas as pd
filename = '/tmp/test.h5'
df = pd.DataFrame(np.arange(10).reshape((5,2)), columns=['A', 'B'])
print(df)
# A B
# 0 0 1
# 1 2 3
# 2 4 5
# 3 6 7
# 4 8 9
# Save to HDF5
df.to_hdf(filename, 'data', mode='w', format='table')
del df # allow df to be garbage collected
# Append more data
df2 = pd.DataFrame(np.arange(10).reshape((5,2))*10, columns=['A', 'B'])
df2.to_hdf(filename, 'data', append=True)
print(pd.read_hdf(filename, 'data'))
yields
A B
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9
0 0 10
1 20 30
2 40 50
3 60 70
4 80 90
Note that you need to use format='table' in the first call to df.to_hdf to make the table appendable. Otherwise, the format is 'fixed' by default, which is faster for reading and writing, but creates a table which can not be appended to.
Thus, you can process the CSVs one at a time, using append=True to build up the hdf5 file. Then overwrite the DataFrame or use del df to allow the old DataFrame to be garbage collected.
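Put together for the many-CSV case in the question, the whole conversion is just a loop (a sketch; the glob pattern and HDF5 path are placeholders):

import glob
import pandas as pd

filename = '/tmp/combined.h5'
for i, csv_file in enumerate(sorted(glob.glob('data/*.csv'))):
    df = pd.read_csv(csv_file)
    # the first file (re)creates the table, the rest append to it
    df.to_hdf(filename, 'data', mode='w' if i == 0 else 'a',
              format='table', append=(i > 0))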
Alternatively, instead of calling df.to_hdf, you could append to a HDFStore:
import numpy as np
import pandas as pd
filename = '/tmp/test.h5'
store = pd.HDFStore(filename)
for i in range(2):
    df = pd.DataFrame(np.arange(10).reshape((5,2)) * 10**i, columns=['A', 'B'])
    store.append('data', df)
store.close()
store = pd.HDFStore(filename)
data = store['data']
print(data)
store.close()
yields
A B
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9
0 0 10
1 20 30
2 40 50
3 60 70
4 80 90
This should be possible with PyTables. You'll need to use the EArray class though.
As an example, the following is a script I wrote to import chunked training data stored as .npy files into a single .h5 file.
import numpy
import tables
import os
training_data = tables.open_file('nn_training.h5', mode='w')
a = tables.Float64Atom()
bl_filter = tables.Filters(5, 'blosc') # fast compressor at a moderate setting
training_input = training_data.create_earray(training_data.root, 'X', a,
                                             (0, 1323), 'Training Input',
                                             bl_filter, 4000000)
training_output = training_data.create_earray(training_data.root, 'Y', a,
                                              (0, 27), 'Training Output',
                                              bl_filter, 4000000)
for filename in os.listdir('input'):
    print "loading {}...".format(filename)
    a = numpy.load(os.path.join('input', filename))
    print "writing to h5"
    training_input.append(a)
for filename in os.listdir('output'):
    print "loading {}...".format(filename)
    training_output.append(numpy.load(os.path.join('output', filename)))
Take a look at the docs for detailed instructions, but very briefly, the create_earray function takes 1) a data root or parent node; 2) an array name; 3) a datatype atom; 4) a shape with a 0 in the dimension you want to expand; 5) a verbose descriptor; 6) a compression filter; and 7) an expected number of rows along the expandable dimension. Only the first two are required, but you'll probably use all seven in practice. The function accepts a few other optional arguments as well; again, see the docs for details.
Once the array is created, you can use its append method in the expected way.
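For the CSV input in the question, the same pattern works if you parse each file into a numpy array first (a sketch, assuming comma-separated floats, 8 columns as a placeholder, and that each individual CSV fits in memory; file names and the expected row count are placeholders too):

import numpy as np
import tables

h5 = tables.open_file('combined.h5', mode='w')
atom = tables.Float64Atom()
earray = h5.create_earray(h5.root, 'data', atom, (0, 8),
                          'csv data', expectedrows=100000000)

for csv_file in ['part1.csv', 'part2.csv']:  # placeholder file names
    chunk = np.loadtxt(csv_file, delimiter=',')
    earray.append(chunk)

h5.close()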
If you have a very large single CSV file, you may want to stream the conversion to hdf, e.g.:
import numpy as np
import pandas as pd
from IPython.display import clear_output
CHUNK_SIZE = 5000000
filename = 'data.csv'
dtypes = {'latitude': float, 'longitude': float}
iter_csv = pd.read_csv(
    filename, iterator=True,
    dtype=dtypes, encoding='utf-8', chunksize=CHUNK_SIZE)
cnt = 0
for ix, chunk in enumerate(iter_csv):
    chunk.to_hdf(
        "data.hdf", 'data', format='table', append=True)
    cnt += CHUNK_SIZE
    clear_output(wait=True)
    print(f"Processed {cnt:,.0f} coordinates..")
Tested with a 64 GB CSV file and 450 million coordinates (the conversion took about 10 minutes).
