So I've finally figured out how to select a portion of a pandas DataFrame, but now I'm lost on how to use the data within it. I want to add together all of the entries that fall within each minute.
for minute in rrule.rrule(rrule.MINUTELY, dtstart=pd.datetime.strptime(dateToday+"T15:30","%Y-%m-%dT%H:%M"), until=pd.datetime.strptime(dateToday+"T22:00","%Y-%m-%dT%H:%M")):
    temp = pageData[(pageData['time']>=minute)&(pageData['time']<minute+timedelta(seconds=60))]
    print(temp)
Figured it out. What I needed was a loop cycling through the per-minute sub-DataFrames (whatever you call them).
# References
from time import *
from dateutil import rrule
from datetime import timedelta
import urllib.request as web
import pandas as pd
import os

forToday = 'http://netfonds.no/quotes/tradedump.php?csv_format=csv&paper='
dateToday = strftime("%Y-%m-%d", localtime())
#dateToday = "2014-10-07"

def pullToday(exchange,stock):
    hold=[]
    fileName=('data/'+exchange+'/'+stock+'/'+dateToday+'.txt')
    try:
        if not os.path.isdir(os.path.dirname(fileName)):
            os.makedirs(os.path.dirname(fileName))
    except OSError:
        print("Something went very wrong. Review the dir creation section")
    pageBuffer=web.urlopen(forToday+stock+'.'+exchange)
    pageData=pd.read_csv(pageBuffer,usecols=['time','price','quantity'])
    for i in pageData.index:
        pageData['time'][i]=pd.datetime.strptime(pageData['time'][i],'%Y%m%dT%H%M%S')
    print(hold)
    #pageData = pageData.set_index('time',drop=False)
    for minute in rrule.rrule(rrule.MINUTELY, dtstart=pd.datetime.strptime(dateToday+"T15:30","%Y-%m-%dT%H:%M"), until=pd.datetime.strptime(dateToday+"T21:58","%Y-%m-%dT%H:%M")):
        temp = pageData[(pageData['time']>=minute)&(pageData['time']<minute+timedelta(seconds=60))]
        volume=0
        priceSum=0
        low=123456
        high=0
        for i in temp.index:
            volume+=temp['quantity'][i]
            priceSum+=temp['quantity'][i]*temp['price'][i]
            if temp['price'][i]>high:
                high=temp['price'][i]
            if temp['price'][i]<low:
                low=temp['price'][i]
        priceSum/=volume
        hold.append([minute,volume,low,high,round(priceSum,4)])
    minute=pd.datetime.strptime(dateToday+"T21:59","%Y-%m-%dT%H:%M")
    temp = pageData[(pageData['time']>=minute)&(pageData['time']<minute+timedelta(seconds=180))]
    volume=0
    priceSum=0
    low=123456
    high=0
    for i in temp.index:
        volume+=temp['quantity'][i]
        priceSum+=temp['quantity'][i]*temp['price'][i]
        if temp['price'][i]>high:
            high=temp['price'][i]
        if temp['price'][i]<low:
            low=temp['price'][i]
    priceSum/=volume
    hold.append([minute,volume,low,high,round(priceSum,4)])
    compiledData=pd.DataFrame(hold, columns=['TimeStamp', 'Volume', 'Low', 'High', 'Average'])
    #for i in compiledData.index:
    #    compiledData['TimeStamp'][i]-=pd.datetime.strptime(dateToday+"TZ06","%Y-%m-%dTZ%H")
    print(compiledData)
    dataFile = open(fileName,'w')
    dataFile.write('#Format: Timestamp;Volume;Low;High;Median\n')
    dataFile.close()
    pageData.to_csv(fileName,index=False,sep=';',mode='a',header=False)

def getList(fileName):
    stockList = []
    file = open(fileName+'.txt', 'r').read()
    fileByLines = file.split('\n')
    for eachLine in fileByLines:
        if '#' not in eachLine:
            lineByValues = eachLine.split('.')
            stockList.append(lineByValues)
    return stockList

start_time = time()
stockList = getList('stocks')
#for eachEntry in stockList:
#    pullToday(eachEntry[0],eachEntry[1])
pullToday('O','AAPL')
delay=str(round((time()-start_time)))
print('Finished in ' + delay)
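For comparison, the per-minute aggregation inside pullToday can also be written with pandas' resample instead of the inner loops. A minimal sketch, assuming pageData already has its 'time' column converted to datetimes plus the 'price' and 'quantity' columns (the function name is my own):

import pandas as pd

def per_minute_summary(pageData):
    # Index by timestamp so resample can bucket trades into 1-minute bins.
    trades = pageData.set_index('time').sort_index()
    grouped = trades.resample('1min')
    summary = pd.DataFrame({
        'Volume': grouped['quantity'].sum(),
        'Low': grouped['price'].min(),
        'High': grouped['price'].max(),
    })
    # Volume-weighted average price per minute.
    turnover = (trades['price'] * trades['quantity']).resample('1min').sum()
    summary['Average'] = (turnover / summary['Volume']).round(4)
    # Drop minutes with no trades.
    return summary[summary['Volume'] > 0]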
I'm trying to get data from the Kraken exchange with the krakenex API, but I'm facing several problems because I want to get data over a longer time range than the API allows.
The API only returns a dataframe with at most 720 rows, so I need a while loop to fetch more data and concatenate it into another dataframe.
I've already read other topics about it, but I'm still not getting good results.
import krakenex
import time
import krakenex
import pandas as pd
from pykrakenapi import KrakenAPI
from datetime import datetime
k = krakenex.API()
start = '28/01/2021 00:00:00'
start = datetime.strptime(start, "%d/%m/%Y %H:%M:%S")
start = int(time.mktime(start.timetuple()))
stop = '03/02/2021 00:00:00'
stop = datetime.strptime(stop, "%d/%m/%Y %H:%M:%S")
stop = int(time.mktime(stop.timetuple()))
prices = pd.DataFrame()
while start < stop:
    time.sleep(5)
    data = k.query_public('OHLC', {'pair':'XXBTZUSD', 'interval':1, 'since':start})
    df = pd.DataFrame( data['result']['XXBTZUSD'])
    daily_prices = df[0].to_list()
    start = int(daily_prices[0])
    prices = pd.concat([precos , df])
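A reworked version of that loop, as an untested sketch: it advances since using the last cursor that the OHLC response returns, and it concatenates into prices (in the snippet above, precos is undefined and start is reset to the oldest row of each reply, so the window never advances):

import time
import krakenex
import pandas as pd

k = krakenex.API()

def fetch_ohlc_range(pair, interval, start, stop):
    """Page through Kraken OHLC data until the stop timestamp is reached."""
    frames = []
    since = start
    while since < stop:
        time.sleep(2)  # stay under the public API rate limit
        data = k.query_public('OHLC', {'pair': pair, 'interval': interval, 'since': since})
        rows = data['result'][pair]
        if not rows:
            break
        frames.append(pd.DataFrame(rows, columns=[
            'time', 'open', 'high', 'low', 'close', 'vwap', 'volume', 'count']))
        since = int(data['result']['last'])  # cursor for the next request
    if not frames:
        return pd.DataFrame()
    prices = pd.concat(frames, ignore_index=True)
    return prices[prices['time'] < stop]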
For weeks I have been working on a script that does exactly that. In my case I collect all pairs with BTC and ETH, but you can use the script with any pair. To do this I use the REST API and define some functions that automate everything. I download the data with a 1-minute timeframe, but it can be used for any timeframe.
First I defined a function that downloads the data in full or from a specific date. This is necessary because the first run downloads the entire history, and later runs download only the new data. The 'interval' parameter sets the timeframe in minutes, while 'since' sets the point from which to download.
import requests
import pandas as pd

def get_ohlc(pair, interval=1, since='last'):
    endpoint = 'https://api.kraken.com/0/public/OHLC'
    payLoad = {
        'pair': pair,
        'interval': interval,
        'since': since
    }
    response = requests.get(endpoint, payLoad)
    data = response.json()
    OHLC = data['result'][pair]
    data = pd.DataFrame.from_records(OHLC, columns=['Time', 'Open', 'High', 'Low', 'Close', 'vwap', 'volume', 'count'])
    data['Time'] = pd.to_datetime(data['Time'], unit='s')
    data.set_index('Time', inplace=True)
    data = data.drop(['vwap', 'volume', 'count'], axis=1)
    data['Open'] = data.Open.astype(float)
    data['High'] = data.High.astype(float)
    data['Low'] = data.Low.astype(float)
    data['Close'] = data.Close.astype(float)
    return data
Then I defined a function to load the .json file that was saved to disk. The function returns the dataframe with the old data and a timestamp that indicates from where to download the new data. I also created a helper function to calculate that timestamp.
import datetime
from dateutil import tz
import pandas as pd

def load_data(pair, path):
    data = pd.read_json(path + pair + '.json', orient='split')
    tmp = data.tail(1).index
    tmp = tmp.strftime('%Y-%m-%d %H:%M:%S')
    dt = str_to_datetime(tmp[0])
    ts = dt.timestamp()
    return data, ts

def str_to_datetime(datestr):
    Y = int(datestr[0:4])
    M = int(datestr[5:7])
    D = int(datestr[8:10])
    H = int(datestr[11:13])
    m = int(datestr[14:16])
    return datetime.datetime(Y, M, D, H, m, 0, tzinfo=tz.gettz("Etc/GMT"))
Now your main should be something like:
from countdown import countdown
import pandas as pd
import datetime
import os

path = os.getcwd() + '/historical_data/'
pair = 'XBTUSD'

while True:
    if os.path.exists(path + pair + '.json') == False:
        data = get_ohlc(pair, 1)  # 1 minute timeframe
        data.to_json(path + pair + '.json', orient='split')
    else:
        data1, ts = load_data(pair, path)
        data2 = get_ohlc(pair, 1, ts)
        data3 = pd.concat([data1, data2])
        data3.drop(data3.tail(1).index, inplace=True)  # delete last record because it's not ended
        data3.to_json(path + pair + '.json', orient='split')
    countdown(60)  # update every hour
I delete the last record because the last candle is not finished when you download it, so it will be downloaded again at the next update. I haven't tested whether this works because I took pieces of code from my program; if it doesn't work, let me know and I'll fix it.
My goal is to use a list of tickers together with the TWS API to extract parts of the company snapshot (reqFundamentalData() -> "ReportSnapshot") and the financial statements (reqFundamentalData() -> "ReportsFinStatements") of these tickers, convert them into a dataframe, and store it as a parquet file.
I tried to merge the solutions provided in:
- use a list of tickers: "TWS API to download stock fundamental data runs only the first data entry and ignores the others. How to solve this?"
- store XML as a dataframe: "Converting XML to Pandas"
- store the data: "Save data from TWS API to csv file"
Code:
from datetime import datetime
from bs4 import BeautifulSoup as bs
import pandas as pd
from ibapi.client import EClient
from ibapi.contract import Contract
from ibapi.wrapper import EWrapper
import logging
import random
import pathlib
import time
from datetime import date
import datetime
from pathlib import Path
class TestApp(EWrapper, EClient):
    def __init__(self, addr, port, client_id):
        EWrapper.__init__(self)  # new - book
        EClient.__init__(self, self)
        self.firstReqId = 8001
        self.contracts = {}  # keep in dict so you can lookup
        self.contNumber = self.firstReqId
        # add dataframes to store the result
        self.df_company_info = pd.DataFrame(data=None, index=None, columns=None)
        self.df_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)

    def addContracts(self, cont):
        self.contracts[self.contNumber] = cont  # add to dict using 8001 first time
        self.contNumber += 1  # next id will be 8002 etc.

    def nextValidId(self, orderId: int):
        # now you are connected, ask for data, no need for sleeps
        # this isn't the only way to know the api is started but it's what IB recommends
        self.contNumber = self.firstReqId  # start with first reqId
        self.getNextData()

    def error(self, reqId, errorCode, errorString):
        print("Error: ", reqId, "", errorCode, "", errorString)
        # if there was an error in one of your requests, just continue with next id
        if reqId > 0 and self.contracts.get(self.contNumber):
            # err in reqFundamentalData based on reqId existing in map
            print('err in', self.contracts[reqId].symbol)
            self.getNextData()  # try next one

    def fundamentalData(self, reqId, fundamental_data):
        self.fundamentalData = fundamental_data
        try:
            if self.fundamentalData is not None:
                # convert XML to dictionary entry
                dict_company_info = self.CompanyInfoXMLtoDict(self.fundamentalData)
                # add dict entry to dataframe
                df_add_row = pd.DataFrame([dict_company_info])
                self.df_company_info = self.df_company_info.append(df_add_row, ignore_index=True)
        except KeyError:
            print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
        except TypeError:
            print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
        except ValueError:
            print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
        except IndexError:
            print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
        self.getNextData()

    def getNextData(self):
        if self.contracts.get(self.contNumber):  # means a contract exists
            # so req data
            self.reqFundamentalData(self.contNumber, self.contracts[self.contNumber], "ReportSnapshot", [])
            self.contNumber += 1  # now get ready for next request
        else:  # means no more sequentially numbered contracts
            print('done')
            self.disconnect()  # just exit
    def CompanyInfoXMLtoDict(self, fundamentals):
        soup = bs(fundamentals, 'xml')
        df_company_info = pd.DataFrame(data=None, index=None, columns=None)
        ticker = ''
        longName = ''
        fullTimeEmployees = 0
        # search for a tag e.g. </IssueID>
        for issues in soup.find_all('IssueID'):
            # within this tag -> search of unique ID e.g. IssueID type=...
            if issues.get('Type') == "Ticker":
                ticker = issues.get_text()
                break
        for coID_i in soup.find_all('CoID'):
            if coID_i.get('Type') == "CompanyName":
                longName = coID_i.get_text()
                break
        for employees_i in soup.find_all('Employees'):
            fullTimeEmployees = employees_i.get_text()
            break
        # create result entry row
        if ticker is not None and ticker != '':
            new_row_dict = {'ticker': ticker, 'name': longName,
                            'num_employees': fullTimeEmployees}
        else:
            new_row_dict = {}
        return new_row_dict
    def FinStmtsXMLtoDF(self, fundamentals, ticker, stmts_type):
        today = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        today_date = date.today().strftime("%Y-%m-%d")
        if stmts_type == 'annual':
            period_type = 'Annual'
        else:
            period_type = 'Interim'
        soup = bs(fundamentals, 'xml')
        # build dict
        stmts_terms = {}
        for terms in soup.find_all("mapItem"):
            # add entry to dict -> dict for mapping of code to description
            stmts_terms[terms.get('coaItem')] = terms.get_text()
        bal_l = []
        inc_l = []
        cas_l = []
        for period in soup.find_all('FiscalPeriod'):
            # quarterly vs. annually
            if period.get('Type') == period_type:
                for statement in period.find_all('Statement'):
                    if statement.find('UpdateType').get('Code') != 'CLA':
                        dic = {}
                        stmts_type = statement.get('Type')
                        # source_date = statement.find('Source').get('Date')
                        statement_date = statement.find('StatementDate').text
                        # dic['date'] = source_date
                        dic['rep_date'] = statement_date
                        for item in statement.find_all('lineItem'):
                            # dic[item.get('coaCode')] = item.text
                            dic[stmts_terms.get(item.get('coaCode'), 'DEFAULT')] = item.text
                        if stmts_type == 'BAL':
                            bal_l.append(dic)
                            # print(stmts_type, date, dic)
                        elif stmts_type == 'INC':
                            inc_l.append(dic)
                        elif stmts_type == 'CAS':
                            cas_l.append(dic)
        df_balance_sheet = pd.DataFrame(bal_l).sort_values('rep_date')
        df_income_statement = pd.DataFrame(inc_l).sort_values('rep_date')
        df_cash_flow = pd.DataFrame(cas_l).sort_values('rep_date')
        # merge all stmts for same rep_date
        df_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
        df_fin_stmts = df_balance_sheet.merge(df_income_statement, how='left',
                                              left_on=['rep_date'],
                                              right_on=['rep_date'])
        df_fin_stmts = df_fin_stmts.merge(df_cash_flow, how='left',
                                          left_on=['rep_date'],
                                          right_on=['rep_date'])
        df_fin_stmts.insert(loc=0, column='ticker', value=ticker)
        df_fin_stmts.insert(loc=1, column='date_updated', value=today_date)
        return df_fin_stmts
def main():
    # ----- config
    project_data_folder = '/home/data/'
    project_data_folder = Path(project_data_folder)
    # tickers are stored in a csv file
    csv_master_ticker = Path('home/data/ticker/ticker-list.csv')
    # load list of tickers
    df = pd.read_csv(csv_master_ticker)
    list_master_ticker = df['ticker'].tolist()
    fusion_company_info = pd.DataFrame(data=None, index=None, columns=None)
    fusion_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
    fusion_q_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
    client = TestApp('127.0.0.1', 7496, 0)
    for ticker in list_master_ticker:
        # remove additional postfix for exchange e.g. XYZ.F -> XYZ
        ticker_clean = ticker.rstrip('.')
        contract = Contract()
        contract.symbol = ticker_clean
        contract.secType = 'STK'
        contract.exchange = "SMART"
        contract.currency = 'USD'
        client.addContracts(contract)
    client.connect('127.0.0.1', 7496, 0)
    client.run()
    if fusion_company_info.empty:
        fusion_company_info = client.df_company_info
    else:
        fusion_company_info = pd.concat([fusion_company_info, client.df_company_info])
    tws_company_info_file_name = 'tws_company_info.parquet'
    file_name = project_data_folder / tws_company_info_file_name
    try:
        if fusion_company_info is not None:
            if not fusion_company_info.empty:
                fusion_company_info.to_parquet(file_name, engine='pyarrow')
        # financial statements - annual
        tws_fin_stmts_file_name = 'tws_fin_stmts.parquet'
        file_name = project_data_folder / tws_fin_stmts_file_name
        try:
            if fusion_fin_stmts is not None:
                if not fusion_fin_stmts.empty:
                    fusion_fin_stmts.to_parquet(file_name, engine='pyarrow')
I get an error message
Traceback (most recent call last):
File "...\ibapi\client.py", line 239, in run
self.decoder.interpret(fields)
File "...\ibapi\decoder.py", line 1278, in interpret
self.interpretWithSignature(fields, handleInfo)
File "...\ibapi\decoder.py", line 1259, in interpretWithSignature
method(*args)
TypeError: 'str' object is not callable
python-BaseException
Can someone help me with this error message?
If I remove the for loop and run it only for a single ticker e.g.
client.contracts = {}
contract = Contract()
contract.symbol = 'AMD'
contract.secType = 'STK'
contract.currency = 'USD'
contract.exchange = "SMART"
client.addContracts(contract)
client.connect('127.0.0.1', 7496, 0)
client.run()
I don't get an error message, and the dataframe self.df_company_info gets populated with the correct data for AMD.
General questions:
Is it possible to get, via reqFundamentalData(), not only the company info "ReportSnapshot" but also the financial statements "ReportsFinStatements" (df_fin_stmts and the function "FinStmtsXMLtoDF") in one request/run?
I'm new to Python and would expect functions to be executed only when they are explicitly called in the code, but with the TWS API (socket, reqId) it seems to work differently, and it's not fully clear to me when which function is called after which.
E.g. how do I know that executing reqFundamentalData() causes the function fundamentalData() to be called? Or nextValidId() is somehow triggered but never explicitly called within the program. Is there a good tutorial that introduces the order in which these functions are called?
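From what I can tell so far, the pairing seems to be: every EClient request method carries a reqId, client.run() is a message loop that reads TWS's replies from the socket, and each reply is dispatched to the EWrapper callback of the matching name with that same reqId. A stripped-down sketch of that pattern (ticker, port and the reqId are placeholders):

from ibapi.client import EClient
from ibapi.wrapper import EWrapper
from ibapi.contract import Contract

class MinimalApp(EWrapper, EClient):
    def __init__(self):
        EWrapper.__init__(self)
        EClient.__init__(self, self)

    # Callback: TWS calls this once the connection is ready;
    # this is the usual place to start sending requests.
    def nextValidId(self, orderId: int):
        contract = Contract()
        contract.symbol = 'AMD'   # placeholder ticker
        contract.secType = 'STK'
        contract.exchange = 'SMART'
        contract.currency = 'USD'
        # Outgoing request: reqId 9001 ties the reply back to this call.
        self.reqFundamentalData(9001, contract, "ReportSnapshot", [])

    # Callback: TWS calls this with the XML once the data arrives.
    def fundamentalData(self, reqId, data):
        print('received fundamental data for reqId', reqId, len(data), 'bytes')
        self.disconnect()

app = MinimalApp()
app.connect('127.0.0.1', 7496, clientId=1)
app.run()  # message loop: reads from the socket and dispatches to the callbacks above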
Thank you very much
I've made a lot of progress on this and can now download 3 of 4 files just fine. However, the Wisconsin file contains timestamps that I can't have removed and that vary from day to day, and I'm struggling to figure out how to get wildcards working on those values with regular expressions. I've posted my revised code below.
Examples of the file names are:
BCW_Daily SDP Yield.rpt2020-02-17-09-02-32.csv
hbc_platelet_daily_02102020.csv
MBC_ROLLING_YIELD_02172020.CSV
IBC_SDP_Rolling_7Days_021720.CSV
Any help is appreciated.
import datetime
import ftplib
import os
ftpdir =('/home/hospserv/inbound/platelet/')
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
os.chdir(savedir)
today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = (f"{today:%Y-%m-%d}")
ildate = (f"{iltoday:%m%d%Y}")
midate = (f"{today:%m%d%Y}")
indate = (f"{today:%m%d%y}")
filenameIN = ('IBC_SDP_Rolling_7Days_'+indate+'.CSV')
filenameWI = ('BCW_SDP_Rolling_7Days.rpt'+widate+'*'+'.csv')
filenameIL = ('hbc_platelet_daily_'+ildate+'.csv')
filenameMI = ('MBC_ROLLING_YIELD_'+midate+'.CSV')
dlfiles = [filenameMI,filenameIN,filenameWI,filenameIL]
connection = ftplib.FTP(host='xxx',user='xxx',passwd='xxx')
welcome = ftplib.FTP.getwelcome(connection)
print(welcome)
connection.cwd(ftpdir)
ftp_list = connection.nlst()
print(ftp_list)
for x in dlfiles:
    if x in ftp_list:
        connection.retrbinary("RETR "+x, open(os.path.join(savedir, x), 'wb').write)
    else:
        print(x+' fail')
connection.quit()
Solved it:
# import modules
import fnmatch
import datetime
import ftplib
import os
#define variables
ftpdir =('/home/hospserv/inbound/platelet/')
savedir = "C:/FTP/"
archivedir = "C:/ftparchive/"
filedir = "C:/DailyData/SDPS/"
os.chdir(savedir)
today = datetime.date.today()
iltoday = datetime.date.today() - datetime.timedelta(days=7)
widate = (f"{today:%Y-%m-%d}")
ildate = (f"{iltoday:%m%d%Y}")
midate = (f"{today:%m%d%Y}")
indate = (f"{today:%m%d%y}")
filenameIN = ('IBC_SDP_Rolling_7Days_'+indate+'.CSV')
pattern = ('BCW_SDP_Rolling_7Days.rpt'+widate+'*'+'.csv')
filenameIL = ('hbc_platelet_daily_'+ildate+'.csv')
filenameMI = ('MBC_ROLLING_YIELD_'+midate+'.CSV')
#create FTP connection
connection = ftplib.FTP(xxxxxxx)
connection.cwd(ftpdir)
#generate file list on FTP
ftp_list = connection.nlst()
#create wildcard string for WI file
wistring = fnmatch.filter(ftp_list,pattern)
filenameWI = str(wistring[0])
dlfiles = [filenameMI,filenameIN,filenameIL,filenameWI]
#download files from FTP to local
for x in dlfiles:
    if x in ftp_list:
        connection.retrbinary("RETR "+x, open(os.path.join(savedir, x), 'wb').write)
connection.quit()
I am running code to download data and save it to a local drive. However, I am getting the above-mentioned error message. Please note that I initially convert the date to a different format, and the error appears while saving the data.
Can you please help me with this error?
import quandl
import os
import pandas as pd
import datetime as dt
import glob
if __name__ == "__main__":
    'Creating bucket to store missing data file.'
    data_missing = []
    New_date = []

    'Defining a path to save CSV files after downloading and also deleting all csv file at one go.'
    extension = 'csv'
    path = "F:/Tradepoint/MyMkt/"
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)
    csv_count = [forma for forma in glob.glob('*.{}'.format(extension))]
    for csv_coun in range(len(csv_count)):
        os.remove(r"F:/Tradepoint/MyMkt/" + csv_count[csv_coun][0:])

    'Setting up quandl configuration, reading ticker list, setting up date for which data is going to get downloaded'
    quandl.ApiConfig.api_key = 'Hba3CzgNnEa2LMxR14FA'
    end_date = dt.date.today()
    diff_year = dt.timedelta(days=3650)
    start_date = end_date - diff_year
    stock_list = pd.read_csv(r"F:\Abhay_New\Abhay\Python\Project\SHARADAR_SF1.csv")

    'Looping through quandl website to download data and renaming them as per requirement.'
    for stock_lis in range(len(stock_list)):
        data = quandl.get_table('SHARADAR/SEP', date={'gte':start_date, 'lte':end_date}, ticker=stock_list.iloc[stock_lis])
        sort_by_date = data.sort_values('date')
        for sort_by_dat in range(len(sort_by_date['date'])):
            Date = dt.date.strftime(sort_by_date['date'][sort_by_dat],'%d-%m-%Y')
            New_date.append(Date)
        if len(data)>1:
            Date = pd.Series(New_date).rename('Date').astype(str)
            OPEN = sort_by_date['open']
            HIGH = sort_by_date['high']
            LOW = sort_by_date['low']
            CLOSE = sort_by_date['close']
            VOLUME = sort_by_date['volume']
            final_data = pd.concat([Date,OPEN,HIGH,LOW,CLOSE,VOLUME],axis=1)
            stk = stock_list.iloc[sort_by_dat][0]
            final_data.to_csv(str(path + stk + '.csv'), sep=',', index = False, header = False)
        else:
            data_missing.append(stock_list.iloc[sort_by_dat])
    print(data_missing)
Thanks,
Abhay Dodiya
The index of both for loops is i. This causes potentially unintended behavior:
for i in range(2):
    for i in range(3,11):
        pass
    print(i)
gives
10
10
So even after exiting the inner loop, the last value of i is still there. Rename the counting variable in that loop and your issue should be gone.
In your case you probably have more dates than stocks, which is why you see the error message you do.
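Applied to the example above, the fix is simply to give the inner loop its own variable:

for i in range(2):
    for j in range(3, 11):  # renamed from i to j
        pass
    print(i)  # now prints 0, then 1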
I'm a hobby coder who started with AHK, then some Java, and now I'm trying to learn Python. I have searched and found some tips, but I have not yet been able to implement them in my own code.
Hopefully someone here can help me; it's a very short program.
I'm using a .txt CSV database with ";" as the separator.
DATABASE EXAMPLE:
Which color is normally a cat?;Black
How tall was the longest man on earth?;272 cm
Is the earth round?;Yes
The database now consists of 20,000 lines, which makes the program too slow; it only uses 25% CPU (1 core).
If I can make it use all 4 cores (100%), I guess it would perform the task a lot faster. The task is basically to compare the CLIPBOARD with the database and, if there is a match, return an answer. Perhaps I could also separate the database into 4 pieces?
The code right now looks like this. It is no more than 65 lines and does its job (but too slowly). Advice on how to make this process multi-core is needed.
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def load_db():
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            db = db.drop_duplicates()
            return db
        except:
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)

def top_answers(db, question):
    db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
    db_sorted = db.sort_values(by='ratio', ascending=False)
    db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
    return db_sorted

def write_txt(top):
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar.txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")

def main():
    try:
        db = load_db()
        last_db_reload = time.time()
        while True:
            # Get contents of clipboard
            question = pp.paste()
            # Rank answer
            top = top_answers(db, question)
            # If answer was found, show results
            if len(top) > 0:
                write_txt(top)
            time.sleep(fall_back_time)
    except:
        print("Error in main(). Will sleep for %i seconds..." % fall_back_time)
        time.sleep(fall_back_time)

if __name__ == '__main__':
    main()
If you could divide the db into four equally large parts, you could process them in parallel like this:
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
import threading
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def worker(thread_id, question):
    thread_id = str(thread_id)
    db = pd.read_csv(db_file_path + thread_id, sep=db_separator, encoding=db_encoding)
    db = db.drop_duplicates()
    db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
    db_sorted = db.sort_values(by='ratio', ascending=False)
    db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
    top = db_sorted
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar" + thread_id + ".txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")
    return

def main():
    question = pp.paste()
    threads = []
    for i in range(1, 5):  # four database chunks: database.txt1 .. database.txt4
        t = threading.Thread(target=worker, args=(i, question))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # join after all threads have started, so they run in parallel

if __name__ == '__main__':
    main()
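This assumes the four chunk files (database.txt1 to database.txt4) already exist; the names follow the worker's db_file_path + thread_id convention. A minimal sketch of one way to create them from the original database.txt (the helper name and chunk count are my own choices):

import numpy as np
import pandas as pd

db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'

def split_db(num_chunks=4):
    # Read the full question;answer database and write it back out as
    # database.txt1 .. database.txt4 so each worker gets its own slice.
    db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
    for i, chunk in enumerate(np.array_split(db, num_chunks), start=1):
        chunk.to_csv(db_file_path + str(i), sep=db_separator,
                     encoding=db_encoding, index=False)

if __name__ == '__main__':
    split_db()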
The solution with multiprocessing:
import time
import pyperclip as pp
import pandas as pd
#import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy as np
# pathos uses a better pickle to transfer more complicated objects
from pathos.multiprocessing import Pool
from functools import reduce
import sys
import os
from contextlib import closing
ratio_threshold = 70
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
chunked_db = []
NUM_PROCESSES = os.cpu_count()
def load_db():
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            db.columns = ['question', 'answer']
            #db = db.drop_duplicates() # i drop it for experiment
            break
        except:
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)
    # split database into equal chunks:
    # (if you have a lot of RAM, otherwise you
    # need to compute ranges in db, something like
    # chunk_size = len(db)//NUM_PROCESSES
    # ranges[i] = (i*chunk_size, (i+1)*chunk_size)
    # and pass ranges in the original db to processes)
    chunked_db = np.array_split(db, NUM_PROCESSES, axis=0)
    return chunked_db
def top_answers_multiprocessed(question, chunked_db):
    # on unix, python uses 'fork' mode by default
    # so the process has 'copy-on-change' access to all global variables
    # i.e. if a process changes something in db, it will be copied to it
    # with a lot of overhead
    # Unfortunately, I've heard that on Windows only 'spawn' mode with a full
    # copy of everything is used
    # The process pipeline uses pickle, which is quite slow,
    # so on a small database you may not benefit from multiprocessing
    # If you are going to transfer big objects in or out, look
    # in the direction of multiprocessing.Array
    # This solution is not fully efficient,
    # as the pool is recreated each time
    # You could create daemon processes which monitor a
    # Queue for incoming questions, but that's harder to implement
    def top_answers(idx):
        # question is in the scope of the parent function
        chunked_db[idx]['ratio'] = chunked_db[idx]['question'].apply(lambda q: fuzz.ratio(q, question))
        db_sorted = chunked_db[idx].sort_values(by='ratio', ascending=False)
        db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
        return db_sorted

    with closing(Pool(processes=NUM_PROCESSES)) as pool:
        # chunked_db is a list of databases
        # they are in global scope, we send only the index because
        # all the data set is pickled
        num_chunks = len(chunked_db)
        # apply function top_answers across generator range(num_chunks)
        res = pool.imap_unordered(top_answers, range(num_chunks))
        res = list(res)
    # now res is a list of dataframes, let's join them
    res_final = reduce(lambda left, right: pd.merge(left, right, on='ratio'), res)
    return res_final
def write_txt(top):
    result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
    result = '\n'.join(result)
    fileHandle = open("svar.txt", "w")
    fileHandle.write(result)
    fileHandle.close()
    pp.copy("")
def mainfunc():
    global chunked_db
    chunked_db = load_db()
    last_db_reload = time.time()
    print('db loaded')
    last_clip = ""
    while True:
        # Get contents of clipboard
        try:
            new_clip = pp.paste()
        except:
            continue
        if (new_clip != last_clip) and (len(new_clip) > 0):
            print(new_clip)
            last_clip = new_clip
            question = new_clip.strip()
        else:
            continue
        # Rank answer
        top = top_answers_multiprocessed(question, chunked_db)
        # If answer was found, show results
        if len(top) > 0:
            #write_txt(top)
            print(top)

if __name__ == '__main__':
    mainfunc()