I get this error when I run the script and I cannot see the solution. This program is supposed to draw a giveaway winner from a sqlite3 file which holds the number of raffle tickets for each user. Recently the program that creates the sqlite3 file updated some things (the draw script is made by me) and I cannot figure out the solution.
Traceback (most recent call last):
File "C:\Users\Admin\Desktop\Draw\Test\dave-draw.py", line 244, in <module>
dd = DaveDraw()
File "C:\Users\Admin\Desktop\Draw\Test\dave-draw.py", line 64, in __init__
self.get_viewers()
File "C:\Users\Admin\Desktop\Draw\Test\dave-draw.py", line 215, in
get_viewers
''').fetchall()
sqlite3.OperationalError: no such column: viewer_id
Here's the code:
#!/usr/bin/env python3
import pdb
import random
import sqlite3
class Viewer(object):
def __init__(self,
viewer_id,
twitch_name,
beam_name,
beam_id,
viewer_type,
rank,
points,
points2,
hours,
raids,
gains_currency,
gains_hours,
in_giveaways,
last_seen,
sub,
entrance_message,
entrance_message_type,
entrance_sfx
):
self.viewer_id = viewer_id
self.twitch_name = twitch_name
self.beam_name = beam_name
self.beam_id = beam_id
self.viewer_type = viewer_type
self.rank = rank
self.points = points
self.points2 = points2
self.hours = hours
self.raids = raids
self.gains_currency = gains_currency
self.gains_hours = gains_hours
self.in_giveaways = in_giveaways
self.last_seen = last_seen
self.sub = sub
self.entrance_message = entrance_message
self.entrance_message_type = entrance_message_type
self.entrance_sfx = entrance_sfx
def win_chance(self, total_tickets):
"""
        Takes the total tickets (points) as a parameter and works
        out the percentage chance that the viewer has of winning.
        Returns the viewer's win chance in percent.
"""
percent = total_tickets / 100.00
return self.points2 / percent
class DaveDraw(object):
def __init__(self):
self.debug = False
self.database_path = 'Viewers3DB.sqlite'
self.db_conn = sqlite3.connect(self.database_path)
self.get_viewers()
self.calculate_total_points()
self.assign_tickets()
def assign_tickets(self):
"""
Assigns each user a number range based on the number of
tickets they have.
e.g.
10 1-10
10 11-20
30 21-50
1 51
"""
self.tickets = {}
latest_ticket = 0
for viewer in self.viewers:
# skip anyone with no points
if viewer.points2 == 0:
continue
            ticket_range_beg = latest_ticket + 1
            ticket_range_end = latest_ticket + viewer.points2
            latest_ticket = ticket_range_end
            # range() excludes its stop value, so go one past the last ticket
            viewer.tickets = range(ticket_range_beg, ticket_range_end + 1)
# assign a range of tickets:
if self.debug:
print("Assigning viewer twitch: %s beam: %s tickets %i-%i" % (viewer.twitch_name, viewer.beam_name, viewer.tickets.start, viewer.tickets.stop))
if ticket_range_beg == ticket_range_end:
if self.debug:
print("Assigning ticket {} to {}".format(ticket_range_beg, viewer.twitch_name))
self.tickets[ticket_range_beg] = viewer
                continue  # single ticket assigned; move on to the next viewer
for ticket in viewer.tickets:
if self.debug:
print("Assigning ticket {} to {}".format(ticket, viewer.twitch_name))
self.tickets[ticket] = viewer
def calculate_total_points(self):
"""
Gets the total amount of points awarded to all
viewers.
"""
self.total_points = 0
for viewer in self.viewers:
self.total_points += viewer.points2
self.total_points_percent = self.total_points / 100
print("Total points awarded (total tickets): %s" % self.total_points)
def draw(self):
"""
Picks a random number between 1 and total tickets, finds
the user that has been assigned tickets within that range and
returns the user.
"""
ticket = random.randint(1, self.total_points)
try:
winner = self.tickets[ticket]
except:
pdb.set_trace()
print("\n===== WINNER Twitch: {} / Beam: {} =====\n".format(winner.twitch_name, winner.beam_id))
print("Picked ticket {}\n".format(ticket))
print("Winner win chance: {:f}".format(winner.win_chance(self.total_points)))
print("Winner's ticket range: {}-{}".format(winner.tickets.start, winner.tickets.stop))
print("Winner's ticket amount: {}\n".format(winner.points2))
self.display_viewer(winner)
def display_random_viewer(self):
"""
Displays random viewer.
"""
self.display_viewer(self.get_random_viewer())
def display_viewer(self, viewer):
"""
Outputs the data on all viewers.
"""
print("""Viewer ID: %s\nTwitch Name: %s\nBeam Name: %s\nBeam ID: %s\nRank: %s\nPoints: %s\nPoints2: %s\nHours: %s\nRaids: %s\nGains Currency: %s\nGains Hours: %s\nInGiveaways: %s\nLastSeen: %s\nEntrance Message: %s\nEntranceMsgType: %s\nEntranceSFX: %s"""
% (
viewer.viewer_id,
viewer.twitch_name,
viewer.beam_name,
viewer.beam_id,
viewer.rank,
viewer.points,
viewer.points2,
viewer.hours,
viewer.raids,
viewer.gains_currency,
viewer.gains_hours,
viewer.in_giveaways,
viewer.last_seen,
viewer.entrance_message,
viewer.entrance_message_type,
viewer.entrance_sfx
)
)
def get_random_viewer(self):
"""
Gets a completely random viewer.
"""
return random.choice(self.viewers)
def get_viewers(self):
"""
Gets data on all the viewers in the database and stores
the data in self.viewers.
"""
c = self.db_conn.cursor()
viewers = c.execute('''
SELECT
viewer_id,
TwitchName,
BeamName,
BeamID,
Type,
Rank,
Points,
Points2,
Hours,
Raids,
GainsCurrency,
GainsHours,
InGiveaways,
LastSeen,
Sub,
EntranceMessage,
EntranceMsgType,
EntranceSFX
FROM Viewer
WHERE Type != 1
AND TwitchName NOT IN (
                'treeboydave',
                'treebotdave'
);
''').fetchall()
self.viewers = []
        for cur_viewer in viewers:
            # the columns are selected in the same order as Viewer's parameters
            self.viewers.append(Viewer(*cur_viewer))
if __name__ == '__main__':
dd = DaveDraw()
dd.draw()
All your other SQL columns are capitalised; any chance that's why it's not finding the viewer_id column? Maybe it's Viewer_Id or similar?
If you execute PRAGMA table_info(Viewer) and print what it returns, it will give you an outline of all of the columns in that database table, so you can make sure you have the capitalisation correct, or see whether the column actually isn't there at all.
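For example, a minimal sketch of that check using Python's sqlite3 module (the filename is the one from the question):
import sqlite3

conn = sqlite3.connect('Viewers3DB.sqlite')
# PRAGMA table_info returns one row per column: (cid, name, type, notnull, dflt_value, pk)
for column in conn.execute('PRAGMA table_info(Viewer)'):
    print(column[1])  # the column name, with its exact capitalisation
conn.close()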
I have Python code that does data analytics from a CSV file. I want my Python code to run periodically in a Docker container: every 15 seconds, it should automatically look at folder A; if there is a CSV file in it, it should process it and put an HTML report with the same name in folder B.
Here is my Python code:
#This program pulls data from a csv file and displays it as an html file.
#The csv file contains device names, card names and temperatures of cards.
#The html file contains: how many devices and how many cards are in the system,
#which device has the card with the highest temperature, and, in the table below,
#how many cards there are in total for each device, how many have a temperature
#of 70 or above, and the highest and average card temperatures.
#NOTE: The print calls in the program are written for trial purposes.
import pandas as pd
from prettytable import PrettyTable, PLAIN_COLUMNS
table = PrettyTable() #create a table for device
table2 = PrettyTable() #create a table for summary
table.field_names = ["Device -", "Total # of Cards - ", "High Temp. Cards # - ", "Max Temperature - ", "Avg. Temperature "]
table2.field_names = [" "," "]
df = pd.read_csv("cards.csv", sep=';', usecols = ['Device','Card','Temperature'])""", index_col=["Device","Card"]"""
print(type(df))
print(df["Device"].nunique(),"\n\n") # number of unique server
total_devices = df["Device"].nunique() # NUMBER OF DEVICES IN DIFFERENT TYPES
print(total_devices)
print(df["Device"].loc[1],"\n\n")
print(df['Temperature'].max(),"\n\n")
maxTemp = df['Temperature'].max() #finding max temperature
print("total card ", )
i= 0
j=1
#Finding the card with the max temperature and the server where the card is located
while j > 0:
if df["Temperature"].loc[i] == df["Temperature"].max():
print(df["Device"].loc[i])
print(df["Card"].loc[i])
deviceName = df["Device"].loc[i]
cardName = df["Card"].loc[i]
j= 0
else :
i = i+1
dev_types = df["Device"].unique() # Server's names
print("\n\n")
newstr = cardName + "/" + deviceName
#Build the summary table
table2.add_row(["Total Devices ", total_devices] )
table2.add_row(["Total Cads ", len(df["Card"])])
table2.add_row(["Max Card Temperature ", df["Temperature"].max()])
table2.add_row(["Hottest Card / Device " ,newstr])
print(table2)
row_num = len(df)
print(row_num)
#Re-read the data indexed by device so each device's cards and temperatures are grouped; the per-device max temp is taken from this
dn = pd.read_csv("cards.csv", sep=';', index_col=["Device"], usecols = ['Device','Card','Temperature'])
#print("max temp: ", dn["Temperature"].loc[dev_types[1]].max())
for x in range(total_devices): # total devices (according the file = 3 )
print("\n")
cardCount = 0 # counts the number of cards belonging to the device
count2 = 0 # Counts the number of cards with a temperature greater than 70
tempcount = 0
print(dev_types[x])
for y in range(row_num):
if dev_types[x] == df["Device"].loc[y]:
print(df["Temperature"].loc[y])
tempcount = tempcount + df["Temperature"].loc[y] # the sum of the temperatures of the cards(used when calculating the average)
cardCount = cardCount +1
if df["Temperature"].loc[y] >= 70:
count2 = count2 +1
maxT = dn["Temperature"].loc[dev_types[x]].max() #Finding the ones with the max temperature from the cards belonging to the server
avg = str(tempcount/cardCount)
print("avg",avg)
table.add_row([dev_types[x], cardCount, count2, maxT,avg ]) # I added the information to the "devices" table
print("num of cards" , cardCount)
print("high temp cards" , count2)
print("\n\n")
print("\n\n")
print(table)
htmlCode = table.get_html_string()
htmlCode2 = table2.get_html_string()
with open('devices.html', 'w') as f:
    f.write("SUMMARY")
    f.write(htmlCode2)
    f.write("DEVICES")
    f.write(htmlCode)
Whether or not the code is run in Docker doesn't matter.
Wrap all of that current logic (well, not the imports and so on) in a function, say, def process_cards().
Call that function forever, in a loop:
import logging
import time
def process_cards():
table = PrettyTable()
...
def main():
logging.basicConfig()
while True:
try:
process_cards()
except Exception:
logging.exception("Failed processing")
time.sleep(15)
if __name__ == "__main__":
main()
As an aside, your data processing code can be vastly simplified:
import pandas as pd
from prettytable import PrettyTable
def get_summary_table(df):
summary_table = PrettyTable() # create a table for summary
total_devices = df["Device"].nunique()
hottest_card = df.loc[df["Temperature"].idxmax()]
hottest_device_desc = f"{hottest_card.Card}/{hottest_card.Device}"
summary_table.add_row(["Total Devices", total_devices])
summary_table.add_row(["Total Cards", len(df["Card"])])
summary_table.add_row(["Max Card Temperature", df["Temperature"].max()])
summary_table.add_row(["Hottest Card / Device ", hottest_device_desc])
return summary_table
def get_devices_table(df):
devices_table = PrettyTable(
[
"Device",
"Total # of Cards",
"High Temp. Cards #",
"Max Temperature",
"Avg. Temperature",
]
)
for device_name, group in df.groupby("Device"):
count = len(group)
avg_temp = group["Temperature"].mean()
max_temp = group["Temperature"].max()
high_count = group[group.Temperature >= 70]["Temperature"].count()
print(f"{device_name=} {avg_temp=} {max_temp=} {high_count=}")
devices_table.add_row([device_name, count, high_count, max_temp, avg_temp])
return devices_table
def do_processing(csv_file="cards.csv", html_file="devices.html"):
# df = pd.read_csv(csv_file, sep=';', usecols=['Device', 'Card', 'Temperature'])
# (Just some random example data)
df = pd.DataFrame({
"Device": [f"Device {1 + x // 3}" for x in range(10)],
"Card": [f"Card {x + 1}" for x in range(10)],
"Temperature": [59.3, 77.2, 48.5, 60.1, 77.2, 61.1, 77.4, 65.8, 71.2, 60.3],
})
summary_table = get_summary_table(df)
devices_table = get_devices_table(df)
with open(html_file, "w") as f:
f.write(
"<style>table, th, td {border: 1px solid black; border-collapse: collapse;}</style>"
)
f.write("SUMMARY")
f.write(summary_table.get_html_string(header=False))
f.write("DEVICES")
f.write(devices_table.get_html_string())
do_processing()
I have an example of a repeat decorator that runs your function every N seconds (or minutes, and so on).
I hope this sample helps you.
from typing import Optional, Callable, Awaitable
import asyncio
from functools import wraps

def repeat_every(*, seconds: float, wait_first: bool = False) -> Callable:
    def decorator(function: Callable[[], Optional[Awaitable[None]]]):
        is_coroutine = asyncio.iscoroutinefunction(function)

        @wraps(function)
        async def wrapped():
            async def loop():
                if wait_first:
                    await asyncio.sleep(seconds)
                while True:
                    try:
                        if is_coroutine:
                            await function()
                        else:
                            # run a plain (blocking) function in a worker thread
                            await asyncio.get_running_loop().run_in_executor(None, function)
                    except Exception:
                        raise
                    await asyncio.sleep(seconds)
            asyncio.create_task(loop())
        print("Repeat every working well.")
        return wrapped
    return decorator

@repeat_every(seconds=2)
async def main():
    print(2 * 2)

try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None
if loop and loop.is_running():
    print('Async event loop already running.')
    tsk = loop.create_task(main())
    tsk.add_done_callback(
        lambda t: print(f'Task done with result= {t.result()}'))
else:
    print('Starting new event loop')
    asyncio.run(main())
There is also the option of making a container entrypoint that runs the script via a cron job.
I want to create comments from a dataset that details the growth rate, market share, etc. for various markets and products. The dataset is in the form of a pd.DataFrame(). I would like the comment to include keywords like increase/decrease based on the calculations; for example, if 2020 Jan has sales of 1000 and 2021 Jan has sales of 1600, that necessarily means an increase of 60%.
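(For reference, the 60% in that example is just the relative change:)
prev, curr = 1000, 1600
growth = (curr - prev) / prev  # 0.6, i.e. an increase of 60%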
I defined a function outside as shown below, and I would like to ask whether this method is too clumsy and, if so, how I should improve it.
from collections import namedtuple

GrowthIncDec = namedtuple('gr_tuple', ['annual_growth_rate', 'quarterly_growth_rate'])
def increase_decrease(annual_gr, quarter_gr):
if annual_gr > 0:
annual_growth_rate = 'increased'
elif annual_gr < 0:
annual_growth_rate = 'decreased'
else:
annual_growth_rate = 'stayed the same'
if quarter_gr > 0:
quarterly_growth_rate = 'increased'
elif quarter_gr < 0:
quarterly_growth_rate = 'decreased'
else:
quarterly_growth_rate = 'stayed the same'
gr_named_tuple = GrowthIncDec(annual_growth_rate=annual_growth_rate, quarterly_growth_rate=quarterly_growth_rate)
return gr_named_tuple
myfunc = increase_decrease(5, -1)
myfunc.annual_growth_rate
output: 'increased'
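As an aside, one possible tidy-up is to collapse the two identical if/elif ladders into a single helper; a minimal sketch (trend is a hypothetical name, and GrowthIncDec is the namedtuple defined above):
def trend(value):
    # Map the sign of a growth rate to a keyword
    if value > 0:
        return 'increased'
    if value < 0:
        return 'decreased'
    return 'stayed the same'

def increase_decrease(annual_gr, quarter_gr):
    return GrowthIncDec(annual_growth_rate=trend(annual_gr),
                        quarterly_growth_rate=trend(quarter_gr))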
A snippet of my main code is as follows to illustrate the use of the above function:
def get_comments(grp, some_dict: Dict[str, List[str]]):
.......
try:
subdf = the dataframe
annual_gr = subdf['Annual_Growth'].values[0]
quarter_gr = subdf['Quarterly_Growth'].values[0]
inc_dec_named_tup = increase_decrease(annual_gr, quarter_gr)
inc_dec_annual_gr = inc_dec_named_tup.annual_growth_rate
inc_dec_quarterly_gr = inc_dec_named_tup.quarterly_growth_rate
comment = "The {} has {} by {:.1%} in {} {} compared to {} {}"\
.format(market, inc_dec_annual_gr, annual_gr, timeperiod, curr_date, timeperiod, prev_year)
comments_df = pd.DataFrame(columns=['Date','Comments'])
# comments_df['Date'] = [curr_date]
comments_df['Comments'] = [comment]
return comments_df
except (IndexError, KeyError) as e:
# this is for all those nan values which is empty
annual_gr = 0
quarter_gr = 0
I've been working on a Python script and am having issues with some verifications I set up. I have a procedure file with a function that uses an order number and a customer number to check some past history about the customer's orders. I've been testing live on our server and I keep failing the last if statement. The order number and customer number I'm using do have more than one order, and some are over 60 days old, so it should pass the test, but it doesn't. I've been looking over my code and I just can't see what could be causing this.
Edit: here are the printed values of the current and retrieved timestamps:
current_timestamp = 1531849617.921927
retrieved_timestamp = 1489622400
two_month_seconds = 5184000
one_month_seconds = 2592000
Python3
from classes import helper
from classes import api
from classes import order
from procedures import orderReleaseProcedure
import time
import datetime
import re
def verifyCustomer(customer_id, order_id):
self_helper = helper.Helper()
customer_blocked_reasons = self_helper.getConfig('customer_blocked_reasons')
order_statuses = self_helper.getConfig('order_statuses')
customer_is_blocked = False
self_api = api.Api()
    self_order = order.Order(order_id)
status = {
'success' : 0,
'message' :'verify_payment_method'
}
results = self_api.which_api('orders?customer_id={}'.format(customer_id))
order_count = results['total_count']
if order_count > 1:
for result in results['orders']:
            order_status_info = self_api.which_api('order_statuses/%d' % result['order_status_id'])
for customer_blocked_reason in customer_blocked_reasons:
if customer_blocked_reason in order_status_info['name']:
customer_is_blocked = True
order_id = 0
order_date = result['ordered_at']
two_month_seconds = (3600 * 24) * 60
one_month_seconds = (3600 * 24) * 30
stripped_date = order_date[:order_date.find("T")]
current_timestamp = time.time()
            retrieved_timestamp = int(datetime.datetime.strptime(stripped_date, '%Y-%m-%d').strftime("%s"))  # "%s" is a platform-specific strftime extension; datetime.timestamp() is the portable way to get epoch seconds
if retrieved_timestamp > (current_timestamp - one_month_seconds) and not customer_is_blocked:
status['success'] = 1
status['message'] = "Customer Verified with orders older than 30 days and no blocking reasons"
print(' 30 day check was triggered ')
print(status)
break
elif customer_is_blocked:
status_change_result = self_order.update_status(order_statuses['order_hold_manager_review'])
status['success'] = 1
status['message'] = "Changed order status to Order Hold - Manager Review"
print(' Customer block was triggered ')
print(status_change_result)
break
elif not retrieved_timestamp < (current_timestamp - two_month_seconds):
status['success'] = 0
status['message'] = "There is more than 1 order, and none are greater than 60 days, we need to check manually"
print(' 60 day check was triggered ')
print(status)
break
return status
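As a side note, a minimal sketch of the same 60-day age check written with datetime.timedelta instead of raw epoch arithmetic (the date string matches the retrieved timestamp shown in the edit; naive local time is assumed):
import datetime

order_dt = datetime.datetime.strptime('2017-03-16', '%Y-%m-%d')
age = datetime.datetime.now() - order_dt
print(age > datetime.timedelta(days=60))  # True for the timestamps shown above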
I'm using PyQt for a GUI application. I also use a sqlite database to feed the software with data.
Somewhere in my code, I have this method:
def loadNotifications(self):
"""Method to find the number of unread articles,
for each search. Load a list of id, for the unread articles,
in each table. And a list of id, for the concerned articles, for
each table"""
count_query = QtSql.QSqlQuery(self.bdd)
count_query.setForwardOnly(True)
        # Don't process the articles when it's the main tab: it's
        # pointless because every article is concerned there for sure
for table in self.list_tables_in_tabs[1:]:
            # Empty these lists because, when loadNotifications is called
            # several times in a session, the number of unread articles
            # would otherwise be added to the existing notification count
table.list_new_ids = []
table.list_id_articles = []
# Try to speed things up
append_new = table.list_new_ids.append
append_articles = table.list_id_articles.append
req_str = self.refineBaseQuery(table.base_query, table.topic_entries, table.author_entries)
print(req_str)
count_query.exec_(req_str)
start_time = datetime.datetime.now()
i = 0
while count_query.next():
i += 1
record = count_query.record()
append_articles(record.value('id'))
if record.value('new') == 1:
append_new(record.value('id'))
print(datetime.datetime.now() - start_time)
print("Nbr of entries processed: {}".format(i))
Let's assume this loop has ~400 entries to process. It takes about one second, and I think it's too long. I tried to optimize the process as much as I could, but it still takes too much time.
Here is what the previous method typically prints:
SELECT * FROM papers WHERE id IN(320, 1320, 5648, 17589, 20092, 20990, 49439, 58378, 65251, 68772, 73509, 86859, 90594)
0:00:00.001403
Nbr of entries processed: 13
SELECT * FROM papers WHERE topic_simple LIKE '% 3D print%'
0:00:00.591745
Nbr of entries processed: 81
SELECT * FROM papers WHERE id IN (5648, 11903, 14258, 30587, 40339, 55691, 57383, 58378, 62951, 65251, 68772, 87295)
0:00:00.000478
Nbr of entries processed: 12
SELECT * FROM papers WHERE topic_simple LIKE '% Python %'
0:00:00.596490
Nbr of entries processed: 9
SELECT * FROM papers WHERE topic_simple LIKE '% Raspberry pi %' OR topic_simple LIKE '% arduino %'
0:00:00.988276
Nbr of entries processed: 5
SELECT * FROM papers WHERE topic_simple LIKE '% sensor array%' OR topic_simple LIKE '% biosensor %'
0:00:00.996164
Nbr of entries processed: 433
SELECT * FROM papers WHERE id IN (320, 540, 1320, 1434, 1860, 4527, 5989, 6022, 6725, 6978, 7268, 8625, 9410, 9814, 9850, 10608, 13219, 15572, 15794, 19345, 19674, 19899, 20990, 22530, 26443, 26535, 28721, 29089, 30923, 31145, 31458, 31598, 32069, 34129, 35820, 36142, 36435, 37546, 39188, 39952, 40949, 41764, 43529, 43610, 44184, 45206, 49210, 49807, 50279, 50943, 51536, 51549, 52921, 52967, 54610, 56036, 58087, 60490, 62133, 63051, 63480, 63535, 64861, 66906, 68107, 68328, 69021, 71797, 73058, 74974, 75331, 77697, 78138, 80152, 80539, 82172, 82370, 82840, 86859, 87467, 91528, 92167)
0:00:00.002891
Nbr of entries processed: 82
SELECT * FROM papers WHERE id IN (7043, 41643, 44688, 50447, 64723, 72601, 81006, 82380, 84285)
0:00:00.000348
Nbr of entries processed: 9
Is this the best way? Can I get better results?
NOTE: the time displayed is the time needed to run the loop, not the time needed to run the query.
I tried count_query.setForwardOnly(True), as mentioned in the doc, but it had no effect on the performance.
EDIT:
Here is a test database with ~600 entries:
database
Obviously I can't test this, so I don't know if it will make a significant difference, but you could try using index-based look-ups:
id_index = count_query.record().indexOf('id')
new_index = count_query.record().indexOf('new')
while count_query.next():
record = count_query.record()
id_value = record.value(id_index)
append_articles(id_value)
if record.value(new_index) == 1:
append_new(id_value)
UPDATE:
Using your sample db, I cannot reproduce the issue you are seeing, and I also found my method above is about twice as fast as your original one. Here's some sample output:
IDs: 660, Articles: 666
IDs: 660, Articles: 666
IDs: 660, Articles: 666
test(index=False): 0.19050272400090762
IDs: 660, Articles: 666
IDs: 660, Articles: 666
IDs: 660, Articles: 666
test(index=True): 0.09384496400161879
Test case:
import sys, os, timeit
from PyQt4 import QtCore, QtGui
from PyQt4.QtSql import QSqlDatabase, QSqlQuery
def test(index=False):
count_query = QSqlQuery('select * from results')
list_new_ids = []
list_id_articles = []
append_new = list_new_ids.append
append_articles = list_id_articles.append
if index:
id_index = count_query.record().indexOf('id')
new_index = count_query.record().indexOf('new')
while count_query.next():
record = count_query.record()
id_value = record.value(id_index)
append_articles(id_value)
if record.value(new_index) == 1:
append_new(id_value)
else:
while count_query.next():
record = count_query.record()
append_articles(record.value('id'))
if record.value('new') == 1:
append_new(record.value('id'))
print('IDs: %d, Articles: %d' % (
len(list_new_ids), len(list_id_articles)))
class Window(QtGui.QWidget):
def __init__(self):
super(Window, self).__init__()
self.button = QtGui.QPushButton('Test', self)
self.button.clicked.connect(self.handleButton)
layout = QtGui.QVBoxLayout(self)
layout.addWidget(self.button)
self.database = QSqlDatabase.addDatabase("QSQLITE")
path = os.path.join(os.path.dirname(__file__), 'tmp/perf-test.db')
self.database.setDatabaseName(path)
self.database.open()
def handleButton(self):
for stmt in 'test(index=False)', 'test(index=True)':
print('%s: %s' % (stmt, timeit.timeit(
stmt, 'from __main__ import test', number=3)))
if __name__ == '__main__':
import sys
app = QtGui.QApplication(sys.argv)
window = Window()
window.setGeometry(600, 300, 200, 100)
window.show()
sys.exit(app.exec_())
I have to perform some analysis on PSL records, which contain information on DNA sequence fragments. Basically I have to find entries that are from the same read and in the same contig (both values are in the PSL entry). The problem is the PSL records are large (10-30 MB text documents). I wrote a program that works on short records, and on the long records given enough time, but it took way longer than specified. I was told the program shouldn't take more than ~15 seconds; mine took over 15 minutes.
PSL records look like this:
275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,
171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,
188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,
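For orientation, the columns used below are whitespace-separated and 0-indexed: strand is field 8, the query (read) name field 9, and the target (contig) name field 13. A quick parse of the first sample record above:
# First sample record, wrapped for readability
psl_line = ("275 11 0 0 0 0 0 0 - M02034:35:000000000-A7UU0:1:1101:19443:1992/2 "
            "286 0 286 NODE_406138_length_13407_cov_13.425076 13465 408 694 1 286, 0, 408,")
fields = psl_line.split()
strand, qname, tname = fields[8], fields[9], fields[13]
read_name, read_number = qname[:-2], qname[-1]  # split off the /2 suffix
print(strand, read_name, read_number, tname)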
My code looks like this:
import sys
class PSLreader :
'''
Class to provide reading of a file containing psl alignments
formatted sequences:
object instantiation:
myPSLreader = PSLreader(<file name>):
object attributes:
fname: the initial file name
methods:
readPSL() : reads psl file, yielding those alignments that are within the first or last
1000 nt
readPSLpairs() : yields psl pairs that support a circular hypothesis
Author: David Bernick
Date: May 12, 2013
'''
def __init__ (self, fname=''):
        '''constructor: saves attribute fname '''
self.fname = fname
def doOpen (self):
        if self.fname == '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
'''
        using the filename given in init, yields each filtered psl record
that contain alignments that are within the terminal 1000nt of
the target. Incomplete psl records are discarded.
If filename was not provided, stdin is used.
        This method selects for alignments that may be part of a
circle.
Illumina pairs aligned to the top strand would have read1(+) and read2(-).
        For the bottom strand, read1(-) and read2(+).
For potential circularity,
these are the conditions that can support circularity:
read1(+) near the 3' terminus
read1(-) near the 5' terminus
read2(-) near the 5' terminus
read2(+) near the 3' terminus
so...
any read(+) near the 3', or
any read(-) near the 5'
'''
nearEnd = 1000 # this constant determines "near the end"
with self.doOpen() as fileH:
for line in fileH:
pslList = line.split()
if len(pslList) < 17:
continue
tSize = int(pslList[14])
tStart = int(pslList[15])
strand = str(pslList[8])
if strand.startswith('+') and (tSize - tStart > nearEnd):
continue
elif strand.startswith('-') and (tStart > nearEnd):
continue
yield line
def readPSLpairs (self):
read1 = []
read2 = []
for psl in self.readPSL():
parsed_psl = psl.split()
strand = parsed_psl[9][-1]
if strand == '1':
read1.append(parsed_psl)
elif strand == '2':
read2.append(parsed_psl)
output = {}
for psl1 in read1:
name1 = psl1[9][:-1]
contig1 = psl1[13]
for psl2 in read2:
name2 = psl2[9][:-1]
contig2 = psl2[13]
if name1 == name2 and contig1 == contig2:
try:
output[contig1] += 1
break
                    except KeyError:
output[contig1] = 1
break
print(output)
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()
I was given some example code that looks like this:
def doSomethingPairwise (a):
for leftItem in a[1]:
for rightItem in a[2]:
            if leftItem[1] == rightItem[1]:
print (a)
thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
['John', 'violin', 1], ['John', 'oboe', 2],
['Patrick', 'theremin', 1], ['Patrick', 'lute',2] ]
thisGroup = None
thisGroupList = [ [], [], [] ]
for name, instrument, num in thisStream:
if name != thisGroup:
doSomethingPairwise(thisGroupList)
thisGroup = name
thisGroupList = [ [], [], [] ]
thisGroupList[num].append([name, instrument, num])
doSomethingPairwise(thisGroupList)
But when I tried to implement it my program still took a long time. Am I thinking about this the wrong way? I realize the nested loop is slow but I don't see an alternative.
Edit: I figured it out, the data was presorted which made my brute force solution very impractical and unnecessary.
I hope this helps you; the question really needs a good example input file.
import sys

#It is better to create a PSLRecord class
class PSLRecord:
def __init__(self, line):
pslList = line.split()
properties = ("matches", "misMatches", "repMatches", "nCount",
"qNumInsert", "qBaseInsert", "tNumInsert",
"tBaseInsert", "strand", "qName", "qSize", "qStart",
"qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
"blockSizes", "qStarts", "tStarts")
self.__dict__.update(dict(zip(properties, pslList)))
class PSLreader :
def __init__ (self, fname=''):
self.fname = fname
def doOpen (self):
        if self.fname == '':
return sys.stdin
else:
return open(self.fname)
def readPSL (self):
with self.doOpen() as fileH:
for line in fileH:
pslrc = PSLRecord(line)
yield pslrc
    #return a dictionary with all psl records grouped by qName and tName
def readPSLpairs (self):
dictpsl = {}
for pslrc in self.readPSL():
            #OP requirement: strip the trailing '1' or '2' char via pslrc.qName[:-1]
key = (pslrc.qName[:-1], pslrc.tName)
if not key in dictpsl:
dictpsl[key] = []
dictpsl[key].append(pslrc)
return dictpsl
#The filter function is better kept outside the class and self-contained
def f_filter(pslrec, nearEnd = 1000):
if (pslrec.strand.startswith('+') and
(int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
return False
if (pslrec.strand.startswith('-') and
(int(pslrec.tStart) > nearEnd)):
return False
return True
PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
#read dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()
from itertools import product
#product from itertools: (1,) x (2, 3) -> (1, 2), (1, 3)
output = {}
for key, v in dictpsl.items():
name, contig = key
    #filtered alignments for read 1 (qName ends in '1')
strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '1']
    #filtered alignments for read 2 (qName ends in '2')
strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '2']
for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
        #This loop makes far fewer comparisons, since the records were grouped beforehand
        #count each read1/read2 pairing for this contig
        output[contig] = output.get(contig, 0) + 1
Note: 10-30 MB isn't a large file, if you ask me.