Dataframe to Time Series when minutes are repeated - python

I'm working with clinical data and want to make predictions of patients' waiting time at every minute, and the data (simplified) looks something like this:
Time(minutes) PatientSerial RemainingTime(minutes)
420 1 5
420 2 10
420 3 8
421 1 4
421 2 9
421 3 7
Here 420 is the number of minutes since midnight (420 = 7:00 am), and the output is RemainingTime (historical data). In general, the machine learning algorithm should predict the waiting time of every patient at each minute, given that the input is clinical data generated every minute. But I'm confused about how to convert this dataframe into a time series when the same minutes are repeated.
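For reference, if one value per minute is acceptable, the repeated minutes can be collapsed with a pandas groupby. A minimal sketch using the sample above (assuming a per-minute mean over all patients is what's wanted; column names shortened from the table):

import pandas as pd

# Sample of the data shown above: one row per patient per minute.
df = pd.DataFrame({
    'Time': [420, 420, 420, 421, 421, 421],
    'PatientSerial': [1, 2, 3, 1, 2, 3],
    'RemainingTime': [5, 10, 8, 4, 9, 7],
})

# Collapse the repeated minutes into one observation per minute,
# here by taking the mean remaining time over all patients.
per_minute = df.groupby('Time')['RemainingTime'].mean()
print(per_minute)
# Time
# 420    7.666667
# 421    6.666667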

For clarity: this is not an answer, but a question about how the result should look (I'm not able to show this table in a comment under the question). It may help in getting a better understanding of how the question should be solved. Edit 1: a coded answer is below.
@Ted:
I would like to know whether the result you are trying to get looks like the following table:
Time (min) MeanWait (min, single default patient)
... ...
420 5.2
421 4.9
422 4.3
423 4.2
...
...
820 11.39
821 11.41
822 11.41
823 11.09
824 10.7
825 10.69
... ...
Should the end result be viewed as a PDF via matplotlib or in a program GUI on screen? If so, modify your question to include that.
EDIT 1:
Based on the comments, I've written the script below, which does the core job: calculating the mean patient waiting time per unit of day time (minutes). Inline comments explain what happens where. As I reckon you can implement the file loading and output writing yourself, I did not add those, nor the matplotlib part; there are many examples here and around the web that suffice.
import datetime

# The day timescale runs from 0 to 1440 minutes and then resets for day 2.
# The input text file can use a 24h (0-1440) or continuous (e.g. 0-4320 == 3 days) timescale for the x-axis.
# Test set for data processing (day 1 and day 2 data).
datas = ['Time(minutes) RemainingTime(minutes)',
         '420 : 5',
         '420 : 10',
         '420 : 8',
         '421 : 4',
         '421 : 9',
         '421 : 7',
         '830 : 8',
         '830 : 4',
         '340 : 3',
         '340 : 5',
         '340 : 4',
         '351 : 10',
         '351 : 7',
         '420 : 9',
         '420 : 7']

def sort_data(scr):
    raw_data = {}
    day_minute_counter = 0
    current_list = []
    day_in_minutes = 24 * 60
    elapsed_days_min = 0  # during processing this holds the elapsed days in minutes
    processed_days = 1
    data_from_exception = {}
    count_exceptions = 0
    for row in scr:
        print(row)
        try:
            # The following steps take into account that elapsed time is linear within a single day.
            # Each row is searched for ":", which identifies the row as holding integers or floats.
            x_value, y_value = row.split(":")
            # print('xy_values : %s, %s' % (x_value, y_value))
            # Clip whitespace from both ends.
            x_value = x_value.strip(' ')
            y_value = y_value.strip(' ')
            # string -> integer conversion
            x_val = int(x_value)
            y_val = int(y_value)
            # Set each x-axis timepoint only once.
            if day_minute_counter == 0:
                print('Start', day_minute_counter, x_val)
                day_minute_counter = x_val
            # Zipping: append every y-axis datapoint that belongs to a single x-axis point.
            if day_minute_counter == x_val:
                print('Append', day_minute_counter, x_val)
                current_list.append(y_val)
            # Add the x,y-axis data to the data dict.
            if day_minute_counter < x_val:
                print('Done', day_minute_counter, x_val, current_list)
                raw_data[day_minute_counter + elapsed_days_min] = current_list
                day_minute_counter = x_val
                # New list for the next point in the "day_minute_counter".
                current_list = [y_val]
            # Correct the x-axis "next-day" time difference.
            if day_minute_counter > x_val:
                processed_days += 1
                print('Next Day Marker', day_minute_counter, x_val, current_list)
                raw_data[day_minute_counter + elapsed_days_min] = current_list
                elapsed_days_min += day_in_minutes
                # A day has elapsed: advance the day offset and restart on the
                # new day's first timepoint, keeping the current y-value.
                day_minute_counter = x_val
                current_list = [y_val]
                print('elapsed days in minutes : ', elapsed_days_min)
        except ValueError:
            # Collect the axis information and any other non-numeric rows.
            count_exceptions += 1
            data_from_exception[count_exceptions] = row
    # End of data block: add the last x,y datapoints without knowing which EOF marker is used.
    raw_data[day_minute_counter + elapsed_days_min] = current_list
    print('\nRaw Data : %s\nOther info : %s\n ' % (raw_data, data_from_exception))
    return (raw_data, processed_days, data_from_exception)

def calc_mean(scr):
    days = scr[1]
    minutes = days * 24 * 60
    missing_datapoints = []
    result = []
    print('Dataset spans a total of "%s" minutes.\n' % minutes)
    data = scr[0]
    for x_datapoint in range(1, minutes):
        try:
            # Process the data from sort_data; minutes with no data raise KeyError.
            dataset = data[x_datapoint]
            meanwait = float(sum(dataset)) / len(dataset)
            result.append((x_datapoint, meanwait))
            print('Patient mean waiting time per Timepoint %s : %.03f' % (x_datapoint, meanwait))
        except KeyError:
            missing_datapoints.append(x_datapoint)
            # print('Patient mean waiting time "%s" is not available.' % x_datapoint)
    return result

def main():
    # Open your file here and use readlines() to load the data into "datas".
    # datas = ...
    ct = str(datetime.datetime.now())[0:23]
    print('%s --> Collecting patient waittime data from Time Series.\n' % ct)
    sorted_data = sort_data(datas)  # uses the template data from this script
    print('Processing data to obtain mean values')
    the_result = calc_mean(sorted_data)
    print('\nProcessing Finished. Here is the result :\n\n%s' % the_result)
    # Create a new file and store the result, or keep processing to PDF with matplotlib.

if __name__ == '__main__':
    main()
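For the matplotlib part that was left out, a minimal sketch could look like this (an assumption on my part, not part of the script above; it plots the_result, the list of (minute, mean) tuples returned by calc_mean):

import matplotlib.pyplot as plt

# the_result is the list of (minute, mean_waiting_time) tuples from calc_mean().
xs, ys = zip(*the_result)
plt.plot(xs, ys, marker='.')
plt.xlabel('Time (minutes, continuous over days)')
plt.ylabel('Mean waiting time (minutes)')
plt.title('Mean patient waiting time per minute')
plt.savefig('waiting_times.pdf')  # or plt.show() for an on-screen GUI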

Related

Generating a nested dictionary in Python through iterations

I'm new to Python and I have to retrieve data from a txt file (which I have already done), and then I need to build a nested dictionary like this:
new_dict = {"2009-10-16": {"KitchenSensor":"active for x minutes today",
"BathroomSensor":"active for y minutes today"...}
"2009-10-24":{"KitchenSensor":"active for x minutes today",
"BathroomSensor":"active for y minutes today"...}
"2009-11-13":{"KitchenSensor":"active for x minutes today",
"BathroomSensor":"active for y minutes today"...}}
my code looks like this
namesFile = open("data.txt", "r")
listaDati = namesFile.readlines()
listaDivisa = []
for i in listaDati:
    if i[27] != "T":
        listaDivisa.append(i.split())
and the datas in my txt file have this format:
2009-10-16 00:01:04.000059 KITCHENSENSOR ON
2009-10-16 02:33:12.000093 KITCHENSENSOR OFF
2009-10-24 21:25:52.000023 BATHROOMSENSOR ON
2009-10-24 23:13:52.000014 BATHROOMSENSOR OFF
2009-11-13 09:03:23.000053 BATHROOMSENSOR ON
2009-11-13 12:13:42.000014 BATHROOMSENSOR OFF
The timestamp changes every now and then, so I want to create a new key with the new timestamp every time I meet a new one, saving the info I need to save. I was trying to do this with an enumerating for loop, but I don't understand how I can create the dictionary.
Thank you!
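As a quick aside before the answer below: the create-a-key-the-first-time-it-appears pattern the question describes is usually spelled with dict.setdefault. A minimal sketch (hypothetical values, not the real sensor computation):

new_dict = {}
rows = [("2009-10-16", "KitchenSensor"), ("2009-10-16", "BathroomSensor"),
        ("2009-10-24", "KitchenSensor")]
for date, sensor in rows:
    # setdefault creates the inner dict the first time a date is seen,
    # and returns the existing inner dict on every later occurrence.
    new_dict.setdefault(date, {})[sensor] = "active for x minutes today"
print(new_dict)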
You're maybe looking for something like this; I separated the task into:
parsing the input lines (could be from a file, but here they're just a list) into events (3-tuples of datetime, sensor name, and state), and
grouping the events by date and looking at the state changes.
import datetime
from itertools import groupby

def parse_line(line):
    # The timestamp is the first two space-separated fields (date and time).
    date_part, time_part, event = line.split(" ", 2)
    # Split the rest of the line at one space.
    sensor, event = event.split(" ", 1)
    # Parse the time string into a real datetime object.
    t = datetime.datetime.strptime(date_part + " " + time_part, "%Y-%m-%d %H:%M:%S.%f")
    return (t, sensor, event == "ON")

def collate_sorted_events(sorted_events):
    zero_delta = datetime.timedelta(0)
    for day, events in groupby(
        sorted_events, lambda event_triple: event_triple[0].date()
    ):
        # We're assuming all sensors start off each day.
        turn_on_times = {}
        durations = {}
        for time, sensor, state in events:
            if state:  # Turning on?
                # If it was on already, that's not an issue; we just consider that a glitch.
                if sensor not in turn_on_times:
                    turn_on_times[sensor] = time
            else:
                if sensor not in turn_on_times:
                    raise ValueError("Sensor was turned off before it was turned on.")
                this_duration = time - turn_on_times[sensor]
                durations[sensor] = durations.get(sensor, zero_delta) + this_duration
                del turn_on_times[sensor]
        yield (day, durations)
        if turn_on_times:
            # This check could be removed, but for now it's a good sanity check...
            raise ValueError(
                "Some sensors were left on at the end of the day; this could be a problem"
            )

listaDati = [
    "2009-10-16 00:01:04.000059 KITCHENSENSOR ON",
    "2009-10-16 02:33:12.000093 KITCHENSENSOR OFF",
    "2009-10-24 21:25:52.000023 BATHROOMSENSOR ON",
    "2009-10-24 23:13:52.000014 BATHROOMSENSOR OFF",
    "2009-11-13 09:03:23.000053 BATHROOMSENSOR ON",
    "2009-11-13 12:13:42.000014 BATHROOMSENSOR OFF",
]

# Parse and sort the input lines. It's imperative that the events are sorted
# so the rest of the code works as it should.
sorted_events = sorted(parse_line(i) for i in listaDati)

# Collate events by day; the function yields day/durations tuples,
# and `dict` accepts that format to create a dict with.
output = dict(collate_sorted_events(sorted_events))
print(output)

for date, deltas in sorted(output.items()):
    for sensor, delta in sorted(deltas.items()):
        print(f"{date} {sensor} {delta.total_seconds() / 60:.2f} minutes")
The output is
{
datetime.date(2009, 10, 16): {'KITCHENSENSOR': datetime.timedelta(seconds=9128, microseconds=34)},
datetime.date(2009, 10, 24): {'BATHROOMSENSOR': datetime.timedelta(seconds=6479, microseconds=999991)},
datetime.date(2009, 11, 13): {'BATHROOMSENSOR': datetime.timedelta(seconds=11418, microseconds=999961)},
}
followed by the formatted output:
2009-10-16 KITCHENSENSOR 152.13 minutes
2009-10-24 BATHROOMSENSOR 108.00 minutes
2009-11-13 BATHROOMSENSOR 190.32 minutes

Check if the number of slots is > 0 before picking a date and an hour?

I am building a vaccination appointment program that automatically assigns a slot to the user.
This builds the table and saves it into a CSV file:
import pandas
start_date = '1/1/2022'
end_date = '31/12/2022'
list_of_date = pandas.date_range(start=start_date, end=end_date)
df = pandas.DataFrame(list_of_date)
df.columns = ['Date/Time']
df['8:00'] = 100
df['9:00'] = 100
df['10:00'] = 100
df['11:00'] = 100
df['12:00'] = 100
df['13:00'] = 100
df['14:00'] = 100
df['15:00'] = 100
df['16:00'] = 100
df['17:00'] = 100
df.to_csv(r'C:\Users\Ric\PycharmProjects\pythonProject\new.csv')
And this code randomly picks a date and an hour from that date in the CSV table we just created:
import pandas
import random
from random import randrange

# randrange randomly picks an index for date and time for the user
random_date = randrange(365)
random_hour = randrange(10)
list = ["8:00", "9:00", "10:00", "11:00", "12:00", "13:00", "14:00", "15:00", "16:00", "17:00"]
hour = random.choice(list)
df = pandas.read_csv('new.csv')
date = df.iloc[random_date][0]
# 1 is subtracted from that cell as 1 slot will be assigned to the user
df.loc[random_date, hour] -= 1
df.to_csv(r'C:\Users\Ric\PycharmProjects\pythonProject\new.csv', index=False)
print(date)
print(hour)
I need help with making the program check whether the random hour it chose on that date has vacant slots. I can manage the while loops that are needed if the number of vacant slots is 0. And no, I haven't tried much, because I have no clue how to do this.
P.S. If you're going to try running the code, please remember to change the save and read location.
Here is how I would do it. I've also cleaned it up a bit.
import random
import pandas as pd

start_date, end_date = '1/1/2022', '31/12/2022'
hours = [f'{hour}:00' for hour in range(8, 18)]

df = pd.DataFrame(
    data=pd.date_range(start_date, end_date),
    columns=['Date/Time']
)
for hour in hours:
    df[hour] = 100

# 1000 simulations
for _ in range(1000):
    random_date, random_hour = random.randrange(365), random.choice(hours)
    # Check if the slot has a vacancy
    if df.at[random_date, random_hour] > 0:
        df.at[random_date, random_hour] -= 1
    else:
        # Pass here, but you can add whatever logic you want;
        # for instance you could give it the next free slot in the same day
        pass

print(df.describe())
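If you want the "next free slot in the same day" idea from the comment above, one possible sketch (it assumes the same df and hours as in the snippet; book_next_free is a hypothetical helper name):

def book_next_free(df, hours, date_idx, start_hour):
    # Try the requested hour first, then every later hour on the same day.
    for hour in hours[hours.index(start_hour):]:
        if df.at[date_idx, hour] > 0:
            df.at[date_idx, hour] -= 1
            return hour
    return None  # the day is full from start_hour onward

# e.g. booked = book_next_free(df, hours, random_date, random_hour)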
import pandas
import random
from random import randrange

# randrange randomly picks an index for date and time for the user
random_date = randrange(365)
# random_hour = randrange(10)  # consider removing this line since it's not used
lista = [  # consider avoiding Python built-in names like "list"
    "8:00",
    "9:00",
    "10:00",
    "11:00",
    "12:00",
    "13:00",
    "14:00",
    "15:00",
    "16:00",
    "17:00",
]
hour = random.choice(lista)
df = pandas.read_csv("new.csv")
date = df.iloc[random_date][0]
# 1 is subtracted from that cell as 1 slot will be assigned to the user
if df.loc[random_date, hour] > 0:  # here is what you asked for
    df.loc[random_date, hour] -= 1
else:
    print(f"No Vacant Slots in {random_date}, {hour}")
df.to_csv(r"new.csv", index=False)
print(date)
print(hour)
Here's another alternative. I'm not sure you really need the very large and slow-to-load pandas module for this; this does it with plain Python structures. I tried to run the simulation until it got a failure, but with 365,000 open slots, and flushing the database to disk each time, it takes too long. I changed the 100 to 8, just to see it hit a full slot in reasonable time.
import csv
import datetime
import random

def create():
    start = datetime.date(2022, 1, 1)
    oneday = datetime.timedelta(days=1)
    headers = ["date"] + [f"{i}:00" for i in range(8, 18)]
    data = []
    for _ in range(365):
        data.append([start.strftime("%Y-%m-%d")] + [8] * 10)  # not 100
        start += oneday
    write(headers, data)

def write(headers, rows):
    fcsv = csv.writer(open('data.csv', 'w', newline=''))
    fcsv.writerow(headers)
    fcsv.writerows(rows)

def read():
    days = []
    headers = []
    for row in csv.reader(open('data.csv')):
        if not headers:
            headers = row
        else:
            days.append([row[0]] + list(map(int, row[1:])))
    return headers, days

def choose(headers, days):
    random_date = random.randrange(365)
    random_hour = random.randrange(len(headers) - 1) + 1
    choice = days[random_date][0] + " " + headers[random_hour]
    print("Chose", choice)
    if days[random_date][random_hour]:
        days[random_date][random_hour] -= 1
        write(headers, days)
        return choice
    else:
        print("Randomly chosen slot is full.")
        return None

create()
data = read()
while choose(*data):
    pass

How to create a function to tell whether a value is increasing or decreasing?

I want to create comments from a dataset that details the growth rate, market share, etc. for various markets and products. The dataset is in the form of a pd.DataFrame(). I would like the comment to include keywords like increase/decrease based on the calculations; for example, if 2020 Jan has sales of 1000 and 2021 Jan has sales of 1600, then that necessarily means an increase of 60%.
I defined a function outside as follows, and I would like to know whether this method is too clumsy and, if so, how I should improve on it.
from collections import namedtuple

GrowthIncDec = namedtuple('gr_tuple', ['annual_growth_rate', 'quarterly_growth_rate'])

def increase_decrease(annual_gr, quarter_gr):
    if annual_gr > 0:
        annual_growth_rate = 'increased'
    elif annual_gr < 0:
        annual_growth_rate = 'decreased'
    else:
        annual_growth_rate = 'stayed the same'
    if quarter_gr > 0:
        quarterly_growth_rate = 'increased'
    elif quarter_gr < 0:
        quarterly_growth_rate = 'decreased'
    else:
        quarterly_growth_rate = 'stayed the same'
    gr_named_tuple = GrowthIncDec(annual_growth_rate=annual_growth_rate, quarterly_growth_rate=quarterly_growth_rate)
    return gr_named_tuple
myfunc = increase_decrease(5, -1)
myfunc.annual_growth_rate
output: 'increased'
A snippet of my main code is as follows to illustrate the use of the above function:
def get_comments(grp, some_dict: Dict[str, List[str]]):
    .......
    try:
        subdf = the dataframe
        annual_gr = subdf['Annual_Growth'].values[0]
        quarter_gr = subdf['Quarterly_Growth'].values[0]
        inc_dec_named_tup = increase_decrease(annual_gr, quarter_gr)
        inc_dec_annual_gr = inc_dec_named_tup.annual_growth_rate
        inc_dec_quarterly_gr = inc_dec_named_tup.quarterly_growth_rate
        comment = "The {} has {} by {:.1%} in {} {} compared to {} {}"\
            .format(market, inc_dec_annual_gr, annual_gr, timeperiod, curr_date, timeperiod, prev_year)
        comments_df = pd.DataFrame(columns=['Date', 'Comments'])
        # comments_df['Date'] = [curr_date]
        comments_df['Comments'] = [comment]
        return comments_df
    except (IndexError, KeyError) as e:
        # this is for all those nan values which is empty
        annual_gr = 0
        quarter_gr = 0
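To the "is this too clumsy" part of the question: the duplicated if/elif ladder can be factored into one small helper, which keeps increase_decrease to a single line per field. A sketch of one possible cleanup (describe is a hypothetical name):

from collections import namedtuple

GrowthIncDec = namedtuple('gr_tuple', ['annual_growth_rate', 'quarterly_growth_rate'])

def describe(rate):
    # Map the sign of a growth rate onto a verb phrase.
    if rate > 0:
        return 'increased'
    if rate < 0:
        return 'decreased'
    return 'stayed the same'

def increase_decrease(annual_gr, quarter_gr):
    return GrowthIncDec(annual_growth_rate=describe(annual_gr),
                        quarterly_growth_rate=describe(quarter_gr))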

Python 3 verification script not checking properly

I've been working on a Python script and am having issues with some verifications I set up. I have a procedure file with a function that uses an order number and a customer number to check some past history about the customer's orders. I've been testing live on our server and I keep failing the last if statement. The order number and customer number I'm using do have more than one order, and some are over 60 days old, so it should pass the test, but it doesn't. I've been looking over my code and I just can't see what could be causing this.
Edit: here are the printed values of the current and retrieved timestamps:
current_timestamp = 1531849617.921927
retrieved_timestamp = 1489622400
two_month_seconds = 5184000
one_month_seconds = 2592000
Python3
from classes import helper
from classes import api
from classes import order
from procedures import orderReleaseProcedure
import time
import datetime
import re

def verifyCustomer(customer_id, order_id):
    self_helper = helper.Helper()
    customer_blocked_reasons = self_helper.getConfig('customer_blocked_reasons')
    order_statuses = self_helper.getConfig('order_statuses')
    customer_is_blocked = False
    self_api = api.Api()
    self_order = order.Order(order_id)
    status = {
        'success': 0,
        'message': 'verify_payment_method'
    }
    results = self_api.which_api('orders?customer_id={}'.format(customer_id))
    order_count = results['total_count']
    if order_count > 1:
        for result in results['orders']:
            order_status_info = self_api.which_api('order_statuses/%d' % result['order_status_id'])
            for customer_blocked_reason in customer_blocked_reasons:
                if customer_blocked_reason in order_status_info['name']:
                    customer_is_blocked = True
                    order_id = 0
            order_date = result['ordered_at']
            two_month_seconds = (3600 * 24) * 60
            one_month_seconds = (3600 * 24) * 30
            stripped_date = order_date[:order_date.find("T")]
            current_timestamp = time.time()
            retrieved_timestamp = int(datetime.datetime.strptime(stripped_date, '%Y-%m-%d').strftime("%s"))
            if retrieved_timestamp > (current_timestamp - one_month_seconds) and not customer_is_blocked:
                status['success'] = 1
                status['message'] = "Customer Verified with orders older than 30 days and no blocking reasons"
                print(' 30 day check was triggered ')
                print(status)
                break
            elif customer_is_blocked:
                status_change_result = self_order.update_status(order_statuses['order_hold_manager_review'])
                status['success'] = 1
                status['message'] = "Changed order status to Order Hold - Manager Review"
                print(' Customer block was triggered ')
                print(status_change_result)
                break
            elif not retrieved_timestamp < (current_timestamp - two_month_seconds):
                status['success'] = 0
                status['message'] = "There is more than 1 order, and none are greater than 60 days, we need to check manually"
                print(' 60 day check was triggered ')
                print(status)
                break
    return status
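Plugging the printed values from the edit into the branch conditions shows why neither the 30-day nor the 60-day branch fires for an old, unblocked order (a quick check using only the numbers above):

current_timestamp = 1531849617.921927
retrieved_timestamp = 1489622400
two_month_seconds = 5184000
one_month_seconds = 2592000

# 30-day branch: is the order newer than 30 days?
print(retrieved_timestamp > current_timestamp - one_month_seconds)      # False: the order is older
# 60-day branch: "not older than 60 days"?
print(not retrieved_timestamp < current_timestamp - two_month_seconds)  # False: the order IS older than 60 days

# So for an order older than 60 days with no blocking reason,
# no branch runs and status keeps its default value.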

Doing operations on a large data set

I have to perform some analysis on a PSL record which contains information on DNA sequence fragments. Basically, I have to find entries that are from the same read in the same contig (these are both values in the PSL entry). The problem is that the PSL records are large (10-30 MB text documents). I wrote a program that works on short records, and on the long records given enough time, but it took far longer than specified: I was told the program shouldn't take more than ~15 seconds, and mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
My code looks like this:
import sys

class PSLreader:
    '''
    Class to provide reading of a file containing psl alignments
    formatted sequences:
    object instantiation:
    myPSLreader = PSLreader(<file name>):
    object attributes:
    fname: the initial file name
    methods:
    readPSL() : reads psl file, yielding those alignments that are within the first or last
    1000 nt
    readPSLpairs() : yields psl pairs that support a circular hypothesis
    Author: David Bernick
    Date: May 12, 2013
    '''

    def __init__(self, fname=''):
        '''constructor: saves attribute fname '''
        self.fname = fname

    def doOpen(self):
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readPSL(self):
        '''
        using filename given in init, returns each filtered psl record
        that contains alignments within the terminal 1000 nt of
        the target. Incomplete psl records are discarded.
        If filename was not provided, stdin is used.
        This method selects for alignments that could be part of a circle.
        Illumina pairs aligned to the top strand would have read1(+) and read2(-).
        For the bottom strand, read1(-) and read2(+).
        For potential circularity,
        these are the conditions that can support circularity:
        read1(+) near the 3' terminus
        read1(-) near the 5' terminus
        read2(-) near the 5' terminus
        read2(+) near the 3' terminus
        so...
        any read(+) near the 3', or
        any read(-) near the 5'
        '''
        nearEnd = 1000  # this constant determines "near the end"
        with self.doOpen() as fileH:
            for line in fileH:
                pslList = line.split()
                if len(pslList) < 17:
                    continue
                tSize = int(pslList[14])
                tStart = int(pslList[15])
                strand = str(pslList[8])
                if strand.startswith('+') and (tSize - tStart > nearEnd):
                    continue
                elif strand.startswith('-') and (tStart > nearEnd):
                    continue
                yield line

    def readPSLpairs(self):
        read1 = []
        read2 = []
        for psl in self.readPSL():
            parsed_psl = psl.split()
            strand = parsed_psl[9][-1]
            if strand == '1':
                read1.append(parsed_psl)
            elif strand == '2':
                read2.append(parsed_psl)
        output = {}
        for psl1 in read1:
            name1 = psl1[9][:-1]
            contig1 = psl1[13]
            for psl2 in read2:
                name2 = psl2[9][:-1]
                contig2 = psl2[13]
                if name1 == name2 and contig1 == contig2:
                    try:
                        output[contig1] += 1
                        break
                    except KeyError:
                        output[contig1] = 1
                        break
        print(output)

PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise(a):
    for leftItem in a[1]:
        for rightItem in a[2]:
            if leftItem[1] is rightItem[1]:
                print(a)

thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
              ['John', 'violin', 1], ['John', 'oboe', 2],
              ['Patrick', 'theremin', 1], ['Patrick', 'lute', 2]]

thisGroup = None
thisGroupList = [[], [], []]

for name, instrument, num in thisStream:
    if name != thisGroup:
        doSomethingPairwise(thisGroupList)
        thisGroup = name
        thisGroupList = [[], [], []]
    thisGroupList[num].append([name, instrument, num])

doSomethingPairwise(thisGroupList)

But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out, the data was presorted which made my brute force solution very impractical and unnecessary.
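Given that edit: since the records arrive sorted by read name, one way to drop the all-pairs scan is to group consecutive filtered lines with itertools.groupby and compare only within a group. A sketch built on the question's PSLreader (it assumes the file really is sorted by read name; read_pairs_sorted is a hypothetical helper):

from itertools import groupby

def read_pairs_sorted(reader):
    # Group consecutive filtered alignments by read name (qName minus the /1 or /2 suffix).
    output = {}
    records = (psl.split() for psl in reader.readPSL())
    for name, group in groupby(records, key=lambda rec: rec[9][:-1]):
        group = list(group)
        contigs1 = {rec[13] for rec in group if rec[9][-1] == '1'}
        contigs2 = {rec[13] for rec in group if rec[9][-1] == '2'}
        # A read pair supports a contig when both mates hit the same contig.
        for contig in contigs1 & contigs2:
            output[contig] = output.get(contig, 0) + 1
    return output

# e.g. print(read_pairs_sorted(PSLreader('EEV14-Vf.filtered.psl')))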
I hope this helps you; the question really needs a better example input file.
import sys
from itertools import product

# It's better to create a PSLRecord class.
class PSLRecord:
    def __init__(self, line):
        pslList = line.split()
        properties = ("matches", "misMatches", "repMatches", "nCount",
                      "qNumInsert", "qBaseInsert", "tNumInsert",
                      "tBaseInsert", "strand", "qName", "qSize", "qStart",
                      "qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
                      "blockSizes", "qStarts", "tStarts")
        self.__dict__.update(dict(zip(properties, pslList)))

class PSLreader:
    def __init__(self, fname=''):
        self.fname = fname

    def doOpen(self):
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readPSL(self):
        with self.doOpen() as fileH:
            for line in fileH:
                pslrc = PSLRecord(line)
                yield pslrc

    # Return a dictionary with all psl records grouped by qName and tName.
    def readPSLpairs(self):
        dictpsl = {}
        for pslrc in self.readPSL():
            # OP requirement: remove the trailing '1' or '2' char, hence pslrc.qName[:-1]
            key = (pslrc.qName[:-1], pslrc.tName)
            if key not in dictpsl:
                dictpsl[key] = []
            dictpsl[key].append(pslrc)
        return dictpsl

# The filter function is better kept outside and self-contained.
def f_filter(pslrec, nearEnd=1000):
    if (pslrec.strand.startswith('+') and
            (int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
        return False
    if (pslrec.strand.startswith('-') and
            (int(pslrec.tStart) > nearEnd)):
        return False
    return True

PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
# Read the dictionary of pairs.
dictpsl = PSL_obj.readPSLpairs()

# product from itertools: (1) x (2, 3) -> (1, 2), (1, 3)
output = {}
for key, v in dictpsl.items():
    name, contig = key
    # Filtered alignments for read 1 (principal strand).
    strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
                    pslrec.qName[-1] == '1']
    # Filtered alignments for read 2 (secondary strand).
    strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
                  pslrec.qName[-1] == '2']
    for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
        # This loop makes fewer comparisons, since the records were grouped beforehand.
        output[contig] = output.get(contig, 0) + 1
Note: 10-30 MB isn't a large file, if you ask me.
