pandas custom file format parsing - python

I have data in the following format:
1_engineer_grade1 |Boolean IsMale IsNorthAmerican IsFromUSA |Name blah
2_lawyer_grade7 |Boolean IsFemale IsAlive |Children 2
I need to convert this into a dataframe with the following columns:
id  job       grade  Bool.IsMale  Bool.IsFemale  Bool.IsAlive  Bool.IsNorthAmerican  Bool.IsFromUSA  Name  Children
1   engineer  1      True         False          False         True                  True            blah  NaN
2   lawyer    7      False        True           True          False                 False           NaN   2
I could preprocess this data in python and then call pd.DataFrame on this, but I was wondering if there was a better way of doing this?
UPDATE: I ended up doing the following; if there are obvious optimizations, please let me know:
import pandas as pd

with open(vwfile, encoding='latin-1') as f:
    data = []
    for line in f:
        line = [x.strip() for x in line.strip().split('|')]
        # line == [
        #     "1_engineer_grade1",
        #     "Boolean IsMale IsNorthAmerican IsFromUSA",
        #     "Name blah"
        # ]
        ident, job, grade = line[0].split("_")
        grade = grade[len("grade"):]  # keep only the numeric part, per the desired output
        features = line[1:]
        bools = {
            "IsMale": False,
            "IsFemale": False,
            "IsNorthAmerican": False,
            "IsFromUSA": False,
            "IsAlive": False,
        }
        others = {}
        for category in features:
            # the prefix in the data is "Boolean", not "Bools"
            if category.startswith("Boolean "):
                for feature in category.split(' ')[1:]:
                    bools[feature] = True
            else:
                feature = category.split(" ")
                # feature == ["Name", "blah"]
                others[feature[0]] = feature[1]
        featuredict = {
            'ident': ident,
            'job': job,
            'grade': grade,
        }
        featuredict.update(bools)
        featuredict.update(others)
        data.append(featuredict)

df = pd.DataFrame(data)
UPDATE 2: A million-line file took about 55 seconds to process.
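For comparison, a mostly vectorized sketch is possible when every line has the id field plus exactly two '|' groups, as in the sample (an assumption; the real file may vary, in which case the loop above is safer). str.removeprefix needs pandas 1.4+; vwfile is the same path variable as above:

import pandas as pd

BOOL_COLS = ['IsMale', 'IsFemale', 'IsAlive', 'IsNorthAmerican', 'IsFromUSA']

# Assumes each line looks like "id|Boolean flags...|Key value" (true for the sample rows).
df = pd.read_csv(vwfile, sep='|', header=None, encoding='latin-1',
                 names=['id', 'bools', 'other'])

# "1_engineer_grade1" -> ident/job/grade columns
df[['ident', 'job', 'grade']] = df['id'].str.strip().str.split('_', expand=True)

# "Boolean IsMale ..." -> one True/False column per flag, missing flags filled with False
flags = (df['bools'].str.strip().str.removeprefix('Boolean ')
         .str.get_dummies(sep=' ').astype(bool)
         .reindex(columns=BOOL_COLS, fill_value=False))

# "Name blah" / "Children 2" -> one column per key, NaN where the key is absent
kv = df['other'].str.strip().str.split(' ', n=1, expand=True)
result = pd.concat([df[['ident', 'job', 'grade']], flags,
                    kv.pivot(columns=0, values=1)], axis=1)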

Related

create new dataframe field using lambda function

I am trying to create new columns based on conditions on another column (the data frame is already aggregated by user). This is a sample of the data frame:
event_names                    country
["deleteobject", "getobject"]  ["us"]
["getobject"]                  ["ca"]
["deleteobject", "putobject"]  ["ch"]
I want to create 3 new columns:
was data deleted?
was data downloaded?
did the events come from my whitelisted countries?
WHITELISTED_COUNTRIES = ["us", "sg"]
like this:
event_names                   country  was_data_deleted?  was_data_downloaded?  whitelisted_country?
["deleteobject","getobject"]  ["us"]   True               True                  True
["getobject"]                 ["ca"]   False              True                  False
["deleteobject","putobject"]  ["ch"]   True               False                 False
This is what I tried so far:
result_df['was_data_deleted'] = result_df['event_name'].apply(lambda x:True if any("delete" in x for i in x) else False)
result_df['was_data_downloaded'] = result_df['event_name'].apply(lambda x:True if "getObject" in i for i in x else False)
result_df['strange_countries'] = result_df['country'].apply(lambda x:False if any(x in WHITELISTED_COUNTRIES for x in result_df['country']) else False)
I get the error "SyntaxError: invalid syntax". Any ideas? Thanks!
The in operator on a list already returns a boolean, so the checks can be direct membership tests:
df['was_data_deleted'] = df['event_names'].apply(lambda x: 'deleteobject' in x)
df['was_data_downloaded'] = df['event_names'].apply(lambda x: 'getobject' in x)
df['whitelisted_country'] = df['country'].apply(lambda x: x[0] in WHITELISTED_COUNTRIES)
print(df)
Prints:
event_names country was_data_deleted was_data_downloaded whitelisted_country
0 [deleteobject, getobject] [us] True True True
1 [getobject] [ca] False True False
2 [deleteobject, putobject] [ch] True False False
You can simplify your lambdas by removing the if-else and the explicit True/False, because the compared values already return booleans:
WHITELISTED_COUNTRIES = ["us", "sg"]

# check for the substring "delete"
f1 = lambda x: any("delete" in i for i in x)
result_df['was_data_deleted'] = result_df['event_names'].apply(f1)

# check for the exact string "getobject"
f2 = lambda x: "getobject" in x
result_df['was_data_downloaded'] = result_df['event_names'].apply(f2)

# check the list against the whitelist
f3 = lambda x: any(y in WHITELISTED_COUNTRIES for y in x)
result_df['strange_countries'] = result_df['country'].apply(f3)

print(result_df)
event_names country was_data_deleted was_data_downloaded \
0 [deleteobject, getobject] [us] True True
1 [getobject] [ca] False True
2 [deleteobject, putobject] [ch] True False
strange_countries
0 True
1 False
2 False
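If the lists get long, an explode-based variant avoids the Python-level loops inside apply; a sketch, assuming the df and WHITELISTED_COUNTRIES names from the snippets above:

# Flatten each list column once, test per element, then aggregate back per row.
events = df['event_names'].explode()
df['was_data_deleted'] = events.str.contains('delete').groupby(level=0).any()
df['was_data_downloaded'] = events.eq('getobject').groupby(level=0).any()
df['whitelisted_country'] = (df['country'].explode()
                             .isin(WHITELISTED_COUNTRIES)
                             .groupby(level=0).any())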

Python 'key error' while building dictionary dynamically (On the fly)

I get an error on this line of code:
result_dict['strat'][k]['name'] = current_comps[0].strip()
The error is: KeyError: 'strat'
I have an input line:
PERSON1 ## CAR1 # ENTRY : 0 | EXIT : 0 ## CAR2 # M1 : YES : 10/01/17 02:00 | M2 : NO : 10/02/16 03:00 | M3 : NO : 05/07/17 11:00 | M4 : YES : 01/01/16 03:00 ## TRUCK # M3 : NO : 03/01/17 03:45 | M23 : NO : 01/01/14 07:00 | M27 : YES : 02/006/18 23:00
I'm looking to parse this input to generate the output detailed below. As part of this, I'm trying to build a dictionary, inserting both keys and values dynamically, and I'm having a lot of problems doing this.
Could I please request help on this?
Here is what I've tried so far:
# File read
f = open('input_data', 'r')
file_cont = f.read().splitlines()
f.close()

# json template
# Initialize dictionary
result_arr = []
result_dict = {}
k = 0

for item in file_cont:
    strat = item.split('##')
    result_dict['Person'] = strat[0].strip()
    j = 1
    while j < len(strat):
        # Split various components of the main line
        current_comps = strat[j].split('#')
        # Name of strat being parsed
        result_dict['strat'][k]['name'] = current_comps[0].strip()
        # tfs across the various time frames
        tfs = current_comps[1].split('|')

        # First travel mode
        if current_comps[0].strip() == 'CAR1':
            temp_low_arr = tfs[0].split(':')
            temp_high_arr = tfs[1].split(':')
            result_dict['strat'][k]['Entry'] = temp_low_arr[1].strip()
            result_dict['strat'][k]['Exit'] = temp_high_arr[1].strip()

        # Second travel mode
        elif current_comps[0].strip() == 'CAR2':
            z = 0
            while z < len(tfs):
                # Split components of the sign
                sign_comp_car_2 = tfs[z].split(':')
                result_dict['strat'][k]['tf'][z]['path'] = sign_comp_car_2[0].strip()
                result_dict['strat'][k]['tf'][z]['sign'] = sign_comp_car_2[1].strip()
                result_dict['strat'][k]['tf'][z]['sign_time'] = sign_comp_car_2[2].strip()
                z += 1

        # Third travel mode
        elif current_comps[0].strip() == 'CAR3':
            b = 0
            while b < len(tfs):
                # Split components of the sign
                sign_car_3 = tfs[b].split(':')
                result_dict['strat'][k]['tf'][b]['path'] = sign_car_3[0].strip()
                result_dict['strat'][k]['tf'][b]['sign'] = sign_car_3[1].strip()
                result_dict['strat'][k]['tf'][b]['sign_time'] = sign_car_3[2].strip()
                b += 1
        j += 1
    k += 1
Expected output
[{
    "Person": "",
    "Transport": [
        {
            "Name": "CAR1",
            "Entry": "0",
            "Exit": "0"
        },
        {
            "name": "CAR2",
            "tf": [
                {
                    "path": "M1",
                    "sign": "YES",
                    "sign_time": "10/01/17 02:00"
                },
                {
                    "path": "M2",
                    "sign": "NO",
                    "sign_time": "10/02/16 03:00"
                },
                {
                    "path": "M3",
                    "sign": "NO",
                    "sign_time": "05/07/17 11:00"
                },
                {
                    "path": "M4",
                    "sign": "YES",
                    "sign_time": "01/01/16 03:00"
                }
            ]
        },
        {
            "name": "CAR3",
            "tf": [
                {
                    "path": "M3",
                    "sign": "NO",
                    "sign_time": "03/01/17 03:45"
                },
                {
                    "path": "M23",
                    "sign": "NO",
                    "sign_time": "01/01/14 07:00"
                },
                {
                    "path": "M27",
                    "sign": "Yes",
                    "sign_time": "02/006/18 23:00"
                }
            ]
        }
    ]
}]
The issue is when you try to assign the ['name'] field in result_dict['strat'][k] when result_dict['strat'][k] hasn't been initialized yet. Before you run your for-loop, the dictionary has no key called strat.
Now you could have done something like result_dict['strat'] = dict() (assigning an object to that key in the dict), but when you further subscript it using result_dict['strat'][k], it will try to resolve that first, by accessing result_dict['strat'], expecting either a subscriptable collection or a dictionary in return. However, since that key doesn't exist yet, it throws you the error.
What you could do instead is initialize a default dictionary:
from collections import defaultdict
...
result_dict = defaultdict(dict)
...
Otherwise, in your existing code, you could initialize a dict within result_dict before entering the loop.
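Note that defaultdict(dict) only auto-creates one level, so result_dict['strat'][k]['name'] would still raise a KeyError at [k]. A minimal sketch of the standard auto-vivifying recipe for arbitrary depth (key names taken from the question):

from collections import defaultdict

def nested_dict():
    # each missing key materializes another nested_dict, so any depth works
    return defaultdict(nested_dict)

result_dict = nested_dict()
result_dict['strat'][0]['name'] = 'CAR1'   # no KeyError at any level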

How to search for multiple data from multiple lines and store them in dictionary?

Say I have a file with the following:
/* Full name: abc */
.....
.....(.....)
.....(".....) ;
/* .....
/* .....
..... : "....."
}
"....., .....
Car : true ;
House : true ;
....
....
Age : 33
....
/* Full name: xyz */
....
....
Car : true ;
....
....
Age : 56
....
I am only interested in the full name, car, house and age of each person. There are many other lines of data, in various formats, between the variables/attributes that I am interested in.
My code so far:
import re

initial_val = {'House': 'false', 'Car': 'false'}

with open('input.txt') as f:
    records = []
    current_record = None
    for line in f:
        if not line.strip():
            continue
        elif current_record is None:
            people_name = re.search('.+Full name ?: (.+) ', line)
            if people_name:
                current_record = dict(initial_val, Name = people_name.group(1))
            else:
                continue
        elif current_record is not None:
            house = re.search(' *(House) ?: ?([a-z]+)', line)
            if house:
                current_record['House'] = house.group(2)
            car = re.search(' *(Car) ?: ?([a-z]+)', line)
            if car:
                current_record['Car'] = car.group(2)
            people_name = re.search('.+Full name ?: (.+) ', line)
            if people_name:
                records.append(current_record)
                current_record = dict(initial_val, Name = people_name.group(1))

print records
What I get:
[{'Name': 'abc', 'House': 'true', 'Car': 'true'}]
My question:
How am I suppose to extract the data and store it in a dictionary like:
{'abc': {'Car': true, 'House': true, 'Age': 33}, 'xyz':{'Car': true, 'House': false, 'Age': 56}}
My purpose:
check whether each person has a car, house and age; if not, return false. Then I could print them in a table like this:
Name  Car   House  Age
abc   true  true   33
xyz   true  false  56
Note that I am using Python 2.7, and I do not know the actual values of the variables/attributes (e.g. abc, true, true, 33) for each person. What is the best solution to my question? Thanks.
Well, you just have to keep track of the current record:
def parse_name(line):
    # first remove the initial '/* ' and final ' */', then take what follows the colon
    stripped_line = line.strip('/* ')
    return stripped_line.split(':')[-1].strip()

WANTED_KEYS = ('Car', 'Age', 'House')
# default values for when the lines are not present for a record
INITIAL_VAL = {'Car': False, 'House': False, 'Age': -1}

with open('the_filename') as f:
    records = []
    current_record = None
    for line in f:
        if not line.strip():
            # skip empty lines
            continue
        elif current_record is None:
            # first record in the file
            if line.startswith('/*'):
                current_record = dict(INITIAL_VAL, name=parse_name(line))
            else:
                # this should probably be an error in the file contents
                continue
        elif line.startswith('/*'):
            # this means that the current record finished, and a new one is starting
            records.append(current_record)
            current_record = dict(INITIAL_VAL, name=parse_name(line))
        else:
            # partition tolerates lines without a colon (sep is '' for those)
            key, sep, val = line.partition(':')
            if sep and key.strip() in WANTED_KEYS:
                # we want to keep track of this field; drop the trailing ' ;'
                current_record[key.strip()] = val.strip(' ;\t\n')
            # otherwise just ignore the line
    if current_record is not None:
        # don't forget the record still in progress when the file ends
        records.append(current_record)

print('Name\tCar\tHouse\tAge')
for record in records:
    print(record['name'], record['Car'], record['House'], record['Age'], sep='\t')
Note that for Age you may want to convert the value to an integer using int:
if key.strip() == 'Age':
    current_record['Age'] = int(val.strip(' ;\n'))
The above code produces a list of dictionaries, but it is easy enough to convert it to a dictionary of dicts:
new_records = {r['name']: dict(r) for r in records}
for val in new_records.values():
    del val['name']
After this, new_records will be something like:
{'abc': {'Car': True, 'House': True, 'Age': 20}, ...}
If you have other lines with a different format in between the interesting ones, you can simply write a function that returns True or False depending on whether the line is in the format you require, and use it to filter the lines of the file:
def is_interesting_line(line):
    if line.startswith('/*'):
        return True
    elif ':' in line:
        return True
    else:
        return False

for line in filter(is_interesting_line, f):
    # code as before
Change is_interesting_line to suit your needs. In the end, if you have to handle several different formats, a regex may be better; in that case you could do something like:
import re

LINE_REGEX = re.compile(r'(/\*.*\*/)|(\w+\s*:.*)| <other stuff>')

def is_interesting_line(line):
    return LINE_REGEX.match(line) is not None
If you want you can obtain fancier formatting for the table, but you probably first need to determine the maximum length of the name etc. or you can use something like tabulate to do that for you.
For example something like (not tested):
max_name_length = max(max(len(r['name']) for r in records), 4)
format_string = '{:<{}}\t{:<{}}\t{}\t{}'
print(format_string.format('Name', max_name_length, 'Car', 5, 'House', 'Age'))
for record in records:
    print(format_string.format(record['name'], max_name_length, record['Car'], 5,
                               record['House'], record['Age']))
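Alternatively, a short sketch with the third-party tabulate package (pip install tabulate), assuming the records list built above:

from tabulate import tabulate

# Build plain rows from the records list and let tabulate size the columns.
rows = [(r['name'], r['Car'], r['House'], r['Age']) for r in records]
print(tabulate(rows, headers=['Name', 'Car', 'House', 'Age']))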

How do I get Python to recognize blank/empty cells in a CSV file [duplicate]

I have a csv file with 5 columns:
1, 2312, "A", , 20
2, 8383, "B", "UK",
3, 3883, , , 45
where the columns represent id, customerId, customerName, customerAddress and customerAge.
I want to put 0 where the age is blank and '' where the other string-type attributes are blank, but I can't identify the blank fields in Python. I have tried things like:
len(row[4]) == 0
row[4] == ''
row[4] == None
repr(row[4]) == ''
but it didn't work. What am I doing wrong?
You want to use not: 0, None, False and '' are all falsy.
if not row[4]:
You could also use:
bool(row[4])
which will return False for all of the values mentioned above.
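Putting it together, a minimal sketch for this file layout (customers.csv is a hypothetical name). Note that the sample has a space after each comma, so the "blank" fields actually contain ' ' unless skipinitialspace is used:

import csv

with open('customers.csv') as f:
    rows = []
    for row in csv.reader(f, skipinitialspace=True):
        id_, cust_id, name, address, age = row
        # falsy (empty) fields get replaced by the defaults
        rows.append([id_, cust_id, name or '', address or '', int(age) if age else 0])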

Doing operations on a large data set

I have to perform some analysis on PSL records, which contain information on DNA sequence fragments. Basically, I have to find entries that are from the same read in the same contig (these are both values in the PSL entry). The problem is that the PSL records are large (10-30 MB text documents). I wrote a program that works on short records, and on the long records given enough time, but it took way longer than specified: I was told the program shouldn't take more than ~15 seconds, and mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys

class PSLreader :
    '''
    Class to provide reading of a file containing psl alignments
    formatted sequences:
    object instantiation:
    myPSLreader = PSLreader(<file name>):

    object attributes:
    fname: the initial file name

    methods:
    readPSL() : reads psl file, yielding those alignments that are within the first or last
    1000 nt
    readPSLpairs() : yields psl pairs that support a circular hypothesis

    Author: David Bernick
    Date: May 12, 2013
    '''

    def __init__ (self, fname=''):
        '''constructor: saves attribute fname '''
        self.fname = fname

    def doOpen (self):
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readPSL (self):
        '''
        using filename given in init, returns each filtered psl record
        that contains alignments within the terminal 1000 nt of
        the target. Incomplete psl records are discarded.
        If filename was not provided, stdin is used.

        This method selects for alignments that could be part of a
        circle.

        Illumina pairs aligned to the top strand would have read1(+) and read2(-).
        For the bottom strand, read1(-) and read2(+).

        For potential circularity,
        these are the conditions that can support circularity:
        read1(+) near the 3' terminus
        read1(-) near the 5' terminus
        read2(-) near the 5' terminus
        read2(+) near the 3' terminus

        so...
        any read(+) near the 3', or
        any read(-) near the 5'
        '''
        nearEnd = 1000   # this constant determines "near the end"
        with self.doOpen() as fileH:
            for line in fileH:
                pslList = line.split()
                if len(pslList) < 17:
                    continue
                tSize = int(pslList[14])
                tStart = int(pslList[15])
                strand = str(pslList[8])

                if strand.startswith('+') and (tSize - tStart > nearEnd):
                    continue
                elif strand.startswith('-') and (tStart > nearEnd):
                    continue
                yield line

    def readPSLpairs (self):
        read1 = []
        read2 = []
        for psl in self.readPSL():
            parsed_psl = psl.split()
            strand = parsed_psl[9][-1]

            if strand == '1':
                read1.append(parsed_psl)
            elif strand == '2':
                read2.append(parsed_psl)

        output = {}
        for psl1 in read1:
            name1 = psl1[9][:-1]
            contig1 = psl1[13]
            for psl2 in read2:
                name2 = psl2[9][:-1]
                contig2 = psl2[13]

                if name1 == name2 and contig1 == contig2:
                    try:
                        output[contig1] += 1
                        break
                    except KeyError:
                        output[contig1] = 1
                        break
        print(output)

PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise (a):
    for leftItem in a[1]:
        for rightItem in a[2]:
            if leftItem[1] == rightItem[1]:
                print (a)

thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
              ['John', 'violin', 1], ['John', 'oboe', 2],
              ['Patrick', 'theremin', 1], ['Patrick', 'lute', 2]]

thisGroup = None
thisGroupList = [ [], [], [] ]

for name, instrument, num in thisStream:
    if name != thisGroup:
        doSomethingPairwise(thisGroupList)
        thisGroup = name
        thisGroupList = [ [], [], [] ]
    thisGroupList[num].append([name, instrument, num])

doSomethingPairwise(thisGroupList)
But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out; the data was presorted, which made my brute-force solution impractical and unnecessary.
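Since the file is presorted by read name, one pass with itertools.groupby is enough; a minimal sketch, assuming readPSL() yields the filtered lines as above (field 9 is qName and field 13 is tName in the PSL layout):

from itertools import groupby

output = {}
reader = PSLreader('EEV14-Vf.filtered.psl')
# group consecutive lines that share a read name (qName minus the /1 or /2 suffix)
for name, group in groupby(reader.readPSL(), key=lambda l: l.split()[9][:-2]):
    fields = [l.split() for l in group]
    contigs1 = {f[13] for f in fields if f[9].endswith('/1')}
    contigs2 = {f[13] for f in fields if f[9].endswith('/2')}
    for contig in contigs1 & contigs2:   # same read pair, same contig
        output[contig] = output.get(contig, 0) + 1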
I hope this helps; the question really needs a better example input file.
import sys

# it is better to create a PSLRecord class
class PSLRecord:
    def __init__(self, line):
        pslList = line.split()
        properties = ("matches", "misMatches", "repMatches", "nCount",
                      "qNumInsert", "qBaseInsert", "tNumInsert",
                      "tBaseInsert", "strand", "qName", "qSize", "qStart",
                      "qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
                      "blockSizes", "qStarts", "tStarts")
        self.__dict__.update(dict(zip(properties, pslList)))

class PSLreader :
    def __init__ (self, fname=''):
        self.fname = fname

    def doOpen (self):
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readPSL (self):
        with self.doOpen() as fileH:
            for line in fileH:
                pslrc = PSLRecord(line)
                yield pslrc

    # return a dictionary with all psl records grouped by qName and tName
    def readPSLpairs (self):
        dictpsl = {}
        for pslrc in self.readPSL():
            # OP requirement: remove the '1' or '2' char, via pslrc.qName[:-1]
            key = (pslrc.qName[:-1], pslrc.tName)
            if key not in dictpsl:
                dictpsl[key] = []
            dictpsl[key].append(pslrc)
        return dictpsl

# the filter function is better kept outside the class and self-contained
def f_filter(pslrec, nearEnd=1000):
    if (pslrec.strand.startswith('+') and
            (int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
        return False
    if (pslrec.strand.startswith('-') and
            (int(pslrec.tStart) > nearEnd)):
        return False
    return True

PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
# read dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()

from itertools import product
# product from itertools:
# (1) x (2,3) = (1,2),(1,3)
output = {}
for key, v in dictpsl.items():
    name, contig = key
    # filtered alignments from the first read of the pair
    strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
                    pslrec.qName[-1] == '1']
    # filtered alignments from the second read of the pair
    strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
                  pslrec.qName[-1] == '2']

    for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
        # this loop makes fewer comparisons, since records were grouped beforehand
        if contig not in output:
            output[contig] = 0   # start at 0 so the first pair counts once
        output[contig] += 1
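The final tally can also be written with collections.Counter, which removes the initialization branch; a sketch using the same names as above:

from collections import Counter

output = Counter()
for (name, contig), v in dictpsl.items():
    princ = [p for p in v if f_filter(p) and p.qName[-1] == '1']
    sec = [p for p in v if f_filter(p) and p.qName[-1] == '2']
    # every (read1, read2) combination for this read/contig counts once
    output[contig] += len(princ) * len(sec)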
Note: 10-30 MB isn't a large file, if you ask me.
