Preserving Data Groupings when writing to Worksheet using OpenPyxl

Preserving Data Groupings when writing to Worksheet using OpenPyxl - python

Disclaimer: I am a beginner-level, self-taught casual 'programmer' with Python.
Background: I have a script that takes Groupings of data from an .xlsx Workbook and writes them to separate Worksheets of a seperate .xlsx like so -
Before state (original_data.xlsx): you will note each top-level is a Client, underneath sits multiple levels of underyling data, pertaining to it's parent client.
After state (split_data.xlsx): you will note each client and their underlying data is written to an identically named Worksheet in a new .xlsx file like so:
Issue: you will note from the After state that the Groupings / Levels have been lost, whilst all the data is present and has different levels of indentation.
Does anyone know how I might enhance my script to ensure that the Groupings / Levels are preserved? E.g., so you can still expand each Grouping (e.g., Client A) like so:
My script: here is the script (sorry it's messy!) which achieves the above, minus the preserved Grouping/Levels.
import openpyxl
from copy import copy
from openpyxl import load_workbook
columns=['A','B','C','D','E','F','G','H','I','J','K','L']
def copy_cell(ws, row,ws_row,ws1):
for col in columns:
ws_cell=ws1[col+str(ws_row)]
new_cell = ws[col+str(row)]
if ws_cell.has_style:
new_cell.font = copy(ws_cell.font)
new_cell.border = copy(ws_cell.border)
new_cell.fill = copy(ws_cell.fill)
new_cell.number_format = copy(ws_cell.number_format)
new_cell.protection = copy(ws_cell.protection)
new_cell.alignment = copy(ws_cell.alignment)
wb1 = openpyxl.load_workbook('original_data.xlsx')
ws1=wb1.active
indexs=[]
clients=[]
index=1
while ws1['A'+str(index)]:
if str(ws1['A'+str(index)].alignment.indent)=='0.0':
indexs.append(index)
clients.append(ws1['A'+str(index)].value)
if ws1['A'+str(index)].value is None:
indexs.append(index)
break
index+=1
wb1.close()
wb = openpyxl.Workbook()
ws=wb.active
start_index=1
headers=['Ownership Structure', 'Fee Schedule', 'Management Style', 'Advisory Firm', 'Inception Date', 'Days in Time Period', 'Adjusted Average Daily Balance (No Div, USD)', 'Assets Billed On (USD)',
'Effective Billing Rate', 'Billing Fees (USD)', 'Bill To Account', 'Model Type']
for y,index in enumerate(indexs):
try:
client=0
if len(clients[y])>=32:
client=clients[y][:31]
else:
client=clients[y]
wb.create_sheet(client)
ws=wb[client]
ws.column_dimensions['A'].width=35
ws.append(headers)
row_index=2
for i in range(start_index,indexs[y+1]):
ws.append([ws1[col+str(i)].value for col in columns])
copy_cell(ws,row_index,i,ws1)
row_index+=1
start_index=indexs[y+1]
except:
pass
wb.save('split_data.xlsx')
wb.close()
try:
wb1 = openpyxl.load_workbook('split_data.xlsx')
a=wb1['Sheet']
wb1.remove(a)
a=wb1['Sheet1']
wb1.remove(a)
wb1.save('split_data.xlsx')
wb1.close()
except:
pass
Resources: here is a link to some test data (original_data.xlsx)

from openpyxl import load_workbook
def get_client_rows(sheet):
"""Get client rows.
Skip header and then look for row dimensions without outline level
"""
return [row[0].row for row in sheet.iter_rows(2) if row[0].alignment.indent == 0.0]
return [
row_index
for row_index, row_dimension in sheet.row_dimensions.items()
if row_index > 1 and row_dimension.outline_level == 0
]
def delete_client_block(sheet, start, end):
"""
Delete rows starting from up to and including end.
"""
for row in range(start, end + 1):
sheet.row_dimensions.pop(row, None)
sheet.delete_rows(start, end - start + 1)
def split_workbook(input_file, output_file):
"""
Split workbook each main group into its own sheet.
Not too loose any formatting we copy the current sheet and remove all rows
which do not belong to extacted group.
"""
try:
workbook = load_workbook(input_file)
data_sheet = workbook.active
client_rows = get_client_rows(data_sheet)
for index, client_row in enumerate(client_rows):
# create new sheet for given client, shorten client as it might be too long
client_sheet = workbook.copy_worksheet(data_sheet)
client_sheet.title = data_sheet.cell(client_row, 1).value[:32]
# delete rows after current client if available
if index < len(client_rows) - 1:
row_after_client = client_rows[index + 1]
delete_client_block(
client_sheet, row_after_client, client_sheet.max_row
)
# delete rows before current client if available
if index > 0:
first_client_row = client_rows[0]
delete_client_block(
client_sheet, first_client_row, client_row - first_client_row + 1
)
# move left over dimensions to top of the sheet
for row_index in list(client_sheet.row_dimensions.keys()):
# skip header row dimension
if row_index > first_client_row - 1:
row_dimension = client_sheet.row_dimensions.pop(row_index)
new_index = row_index - client_row + first_client_row
row_dimension.index = new_index
client_sheet.row_dimensions[new_index] = row_dimension
del workbook[data_sheet.title]
workbook.save(output_file)
finally:
workbook.close()
if __name__ == "__main__":
input_file = 'data.xlsx'
output_file = "data_output.xlsx"
split_workbook(input_file, output_file)

Related

Insert value in cells with spinners in MS Project (.mpp file) using Python

I am trying to insert data from excel file into project file i.e .mpp file. Following code works fine while putting data into simple cells (example cost column) in .mpp file.
import openpyxl
import win32com.client
file = 'PATH_TO_PROJECT_FILE'
project_app = win32com.client.Dispatch('MSProject.Application')
project_app.Visible = 1
project_app.FileOpen(file)
project = project_app.ActiveProject
project_tasks = project.Tasks
file_path = 'PATH_TO_EXCEL_FILE'
file = openpyxl.load_workbook(file_path,keep_vba=True)
sheet3 = file['OutputData']
for cell,task in zip(sheet3['D'][1:],project_tasks):
if '=' in str(cell.value):
break
task.Cost = cell.value
But in .mpp file, cells of duration column have spinners/spin buttons to set number of days. see picture below:
If I use the same code as above to set/insert value of duration using python, it does not change the value, moreover it does not throw any exception and the program exit in normal fashion
Same is the case with start date in project. see the picture below:
A solution came to my mind but it does not seem feasible. If I create a custom column and insert duration and start date into it. But the draw back of this solution is that the visuals in MS Project files are dependent on previously default columns.
How can I set/insert values into these type of cells? Or if there is anything I can do in MS Project? Or there's any solution using vba?

You can set task.Start and task.Duration just as you can set task.Cost. The fact that the MS Project UI has spinners for those fields is not relevant. Try a simple example without pulling data from Excel to start. Then validate that the data in Excel is in the proper format.
import win32com.client
file = 'PATH_TO_PROJECT_FILE'
project_app = win32com.client.Dispatch('MSProject.Application')
project_app.Visible = 1
project_app.FileOpen(file)
project = project_app.ActiveProject
project_tasks = project.Tasks
task = project_tasks(1) #pick a non-summary task
task.Cost = 100
task.Duration = "10 days"
task.Start = "10/01/22"
project_app.FileSave()
project_app.Quit(True)
BTW: On Summary tasks, Duration and Start are calculated fields and are therefore read-only.

Junaid, I'm not familiar with Python so I can't help with that but what you want to do is very possible with VBA. Here is a macro I wrote a few years ago that should help you get started in the right direction.
John
'Macro written by John - Project
'Version 1.0 9/25/15 11:00 AM
Option Explicit
Option Compare Text
Public Const ver = " - 1.0"
Public xl As Excel.Application
Public WB As Excel.Workbook
Public S As Excel.Worksheet
Public c As Excel.Range
Public Tsks As Tasks
Public UID As Single
Public SeedDt As Date
Public DurVal As Single, HPD As Single, HPW As Single, cf As Single
Public numrows As Integer, i As Integer, p1 As Integer
Public curcel As Variant 'could be either a number or text
Sub ImportExcelDataToProject()
MsgBox "This macro imports the following data fields from Excel:" & vbCr & _
" Task Name" & vbCr & " Outline Level" & vbCr & _
" Duration" & vbCr & " Start (if necessary)" & vbCr & _
" Predecessors" & vbCr & " Resource Names" & vbCr & _
" Task Notes", vbInformation, "Import from Excel" & ver
'Open the Excel workbook to gather data
' Note: Excel need not be running
Set WB = Workbooks.Open(FileName:="C:\Users\John\Desktop\ExcelToProjectVBAImportX.xlsx")
Set S = WB.Worksheets(1)
'Create new Project file to receive imported data
FileNew
'----------------------
'Gather some basic parameters from Excel and Project
' Find earliest start date used in Excel workbook
sort1
' Find out how many rows of data in Excel worksheet
' (assumes first row is header, if there is none remove the "-1")
numrows = WB.Worksheets(1).UsedRange.Rows.Count - 1
' Find the default hours per day and hours per week settings for Project
HPD = ActiveProject.HoursPerDay
HPW = ActiveProject.HoursPerWeek
'-----------------------
'Read each row of data from the worksheet and create tasks in Project
Application.Caption = "Progress"
ActiveWindow.Caption = " Reading worksheet and exporting"
Set c = S.Range("B2") 'set reference to first column of data to be imported
Set Tsks = ActiveProject.Tasks
For i = 0 To numrows - 1
Tsks.Add.Name = c.Offset(i, 0).Value
'find the unique ID of the task just added
' since tasks are added in sequence, the count property identifies the current task
' (having the Unique ID facilitates expansion of the macro for increased functionality)
UID = Tsks(Tsks.Count).UniqueID
Tsks.UniqueID(UID).OutlineLevel = c.Offset(i, 1).Value
'skip remaining columns for this row if this is destined to be a summary line in Project
' (Project calculates duration and start and best practices dictate no resources assigned)
If c.Offset(i, 2).Value <> "" Then
'resolve units used in duration column of Excel worksheet
DecodeXLDurUnits
Tsks.UniqueID(UID).Duration = DurVal
Tsks.UniqueID(UID).Predecessors = c.Offset(i, 3).Value
'if no predecessors exist for this task AND it starts after the Project Start Date
' then set start date. Note: this will set a start-no-earlier-than (SNET) constraint
If Tsks.UniqueID(UID).Predecessors = "" And CStr(c.Offset(i, 4).Value) > SeedDt Then
Tsks.UniqueID(UID).Start = CStr(c.Offset(i, 4).Value)
End If
Tsks.UniqueID(UID).ResourceNames = c.Offset(i, 5).Value
End If
Tsks.UniqueID(UID).Notes = c.Offset(i, 6).Value
Next i
'------------------------
'Finally, close and exit
MsgBox "Data Import is complete", vbOKOnly, "Import from Excel"
Application.Caption = ""
ActiveWindow.Caption = ""
WB.Close savechanges:=False
End Sub
'This routine determines if duration column in Excel is in minutes, hours, days or weeks
' (most likely units) and then adjusts the data accordingly for import to Project
Sub DecodeXLDurUnits()
curcel = c.Offset(i, 2).Value
'default if duration column is in minutes
p1 = Len(CStr(curcel)) + 1
cf = 1
If InStr(curcel, "h") > 0 Then
p1 = InStr(curcel, "h")
cf = 60
ElseIf InStr(curcel, "d") > 0 Then
p1 = InStr(curcel, "d")
cf = HPD * 60
ElseIf InStr(curcel, "w") > 0 Then
p1 = InStr(curcel, "w")
cf = HPW * 60
End If
'convert duration value to be in minutes for Project import
DurVal = CSng(Mid(curcel, 1, p1 - 1)) * cf
End Sub
'This routine examines the pre-formatted Excel Workbook Start column and finds the
' earliest date. This is then used to set the Project Start Date
Sub sort1()
Dim Cnt As Integer
numrows = S.UsedRange.Rows.Count
SeedDt = "12/31/2049" 'maintain compatibility with Pre-Project 2013 versions
Set c = S.Range("F2")
For i = 0 To numrows - 1
If c.Offset(i, 0).Value <> "" And c.Offset(i, 0).Value < SeedDt Then SeedDt = c.Offset(i, 0).Value
Next i
ActiveProject.ProjectStart = SeedDt
End Sub

keep calling an API until it is updated with latest item (Python)

I'm looking to call an API, and compare the data to my saved data in a CSV. If it has a new data point then I want to update my CSV and return the DataFrame... The mystery I have is why these two variables appear to be the same, yet the If statement moves to the Else instead of recognizing they are the same, if they are the same it should keep looping until an updated data point appears,(see second_cell == lastItem1 )
import pandas_datareader as pdr # https://medium.com/swlh/pandas-datareader-federal-reserve-economic-data-fred-a360c5795013
import datetime
def datagetter():
i = 1
while i < 120:
start = datetime.datetime (2005, 1, 1) ### Step 1: get data, and print last item
end = datetime.datetime (2040, 1, 1)
df = pdr.DataReader('PAYEMS', 'fred', start, end) ## This is the API
lastItem1 = df["PAYEMS"].iloc[-1] # find the last item in the data we have just downloaded
print ("Latest item from Fred API: " , lastItem1) ### Print the last item
with open('PAYEMS.csv', 'r') as logs: # So first we open the most recent CSV file
data = logs.readlines()
last_row = data[-1].split(',') # split is default on , as CSVs should be.
second_cell = last_row[1] # "second_cell" is our variable name for the saved datapoint from last month/week/day
print ("Last Item, in thousands" , second_cell)
if second_cell == lastItem1:
print ("CSV " , second_cell, "API ", lastItem1, " downloaded and stored items are the same, will re-loop until a new datapoint")
print("attempt no.", i)
i += 1
else:
df.to_csv("PAYEMS.csv")
print ("returning dataframe")
# print(df.tail())
return df
df = datagetter()
print(df.tail(3))

solved my own problem:
my CSV was returning a string, and the API an int... not quite sure why.
So
if second_cell == "": second_cell = 0 second_cell1 = int(float(second_cell))

Alphabetically ordering Worksheets using openpxl in WriteOnlyWorkSheet mode

Background:I have created a script that takes a .xlsx file and splits the Groupings of data into separate worksheets, based on the names of each Grouping. E.g., Client A Grouped data will be written to a worksheet named Client A and so forth.Additionally, the script strips out Excel-invalid characters, and miscellaneous strings contained within INVALID_TITLE_CHARS and INVALID_TITLE_NAMES as well as uses values in management_style and model_type to decide which rows are irrelevant and therefore skipped, during the copy process.
My issue:I would like to expand the script so that the Worksheets are alphabetical. E.g., 'Client A, Client B, Client C, and so forth. I have tried to achieve this by client_sheet._sheets.sort(key=lambda output_workbook: output_workbook.title) however, for memory consistency reasons, I have used WriteOnlyWorksheet which seems incompatible with _sheets.Does anyone know if there is another solution? I was hoping to avoid creating a function that reopens the.xlsx
The script:This is the full script:
import time
import logging
import sys
import datetime as dt
import datetime
import requests as requests
import json
import enlighten
import warnings
from openpyxl import load_workbook
from openpyxl import LXML
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
from copy import copy
from configparser import ConfigParser
from requests.auth import HTTPBasicAuth
logger = logging.getLogger()
timestr = datetime.datetime.now().strftime("%Y-%m-%d")
warnings.filterwarnings("ignore")
def configure_logging():
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
INVALID_TITLE_NAMES = ["zz_ FeeRelationship", "Family"]
def clean_sheet_title(title):
title = title or ""
title = title.strip()
title = title.translate(INVALID_TITLE_CHAR_MAP)
for name in INVALID_TITLE_NAMES:
title = title.replace(name, "")
return title[:31]
def is_client_row(row, row_dimension):
return row_dimension.outlineLevel == 0
def create_write_only_cell(source_cell, target_sheet):
target_cell = WriteOnlyCell(target_sheet, value=source_cell.value)
target_cell.data_type = source_cell.data_type
if source_cell.has_style:
target_cell.font = copy(source_cell.font)
# target_cell.border = copy(source_cell.border)
# target_cell.fill = copy(source_cell.fill)
target_cell.number_format = copy(source_cell.number_format)
# target_cell.protection = copy(source_cell.protection)
target_cell.alignment = copy(source_cell.alignment)
return target_cell
def create_write_only_row(source_row, target_sheet):
return [create_write_only_cell(cell, target_sheet) for cell in source_row]
def skip_row(row, row_dimension):
"""
Determine whether a row needs to be skipped and not copied to new workbook
"""
def get_column_value(column):
value = row[column].value or ""
return value.strip()
# skip total line
if row[0].value == "Total":
return True
management_style = [
"Advisory",
"Advisory - No Fee",
"Holding",
"JPAS",
"Liquidity Management",
"Trading",
"",
]
model_type = ["Client", "Holding Account", "Holding Company", "Trust", ""]
management_value = get_column_value(3)
model_value = get_column_value(11)
# Pass on either column
return management_value not in management_style and model_value not in model_type
# # Pass on both columns
# return management_value not in management_style or model_value not in model_type
def split_workbook(input_file, output_file):
"""
Split workbook each client into its own sheet.
"""
try:
logger.info(f"Loading workbook {input_file}")
workbook = load_workbook(input_file)
data_sheet = workbook.active
output_workbook = Workbook(write_only=True)
client_sheet = None
client_row_index = 2
processing_client = 0
skip_child = False
skipped_parent_outline_level = 0
skipped_rows_per_client = 0
rows = data_sheet.rows
header = next(rows)
for index, row in enumerate(rows, start=2):
row_dimension = data_sheet.row_dimensions[index]
# verify whether current row is a child of skipped parent
if skip_child and skipped_parent_outline_level < row_dimension.outlineLevel:
skipped_rows_per_client += 1
continue
# reset skip_child when current row is not a child of skipped parent anymore
if (
skip_child
and skipped_parent_outline_level >= row_dimension.outlineLevel
):
skip_child = False
# check whether row needs to be skipped
if skip_row(row, row_dimension):
skipped_rows_per_client += 1
skip_child = True
skipped_parent_outline_level = row_dimension.outlineLevel
continue
# create new sheet found new client is found
if is_client_row(row, row_dimension):
skipped_rows_per_client = 0
processing_client += 1
client_sheet_title = clean_sheet_title(row[0].value)
logger.info(f"Processing client {processing_client}")
client_sheet = output_workbook.create_sheet(client_sheet_title)
client_row_index = index
# copy column dimensions
for key, column_dimension in data_sheet.column_dimensions.items():
client_sheet.column_dimensions[key] = copy(column_dimension)
client_sheet.column_dimensions[key].worksheet = client_sheet
client_sheet.append(create_write_only_row(header, client_sheet))
# copy row dimensions
new_row_index = index - skipped_rows_per_client - client_row_index + 2
client_sheet.row_dimensions[new_row_index] = copy(row_dimension)
client_sheet.row_dimensions[new_row_index].worksheet = client_sheet
# finally copy row
client_sheet.append(create_write_only_row(row, client_sheet))
if index % 10000 == 0:
logger.info(f"{index} rows processed")
logger.info(f"Writing workbook {output_file}")
output_workbook.save(output_file)
finally:
if workbook:
workbook.close()
if output_workbook:
output_workbook.close()
if __name__ == "__main__":
start = time.time()
configure_logging()
input_file = 'Input_File'+timestr+'.xlsx'
output_file = 'Output_File'+timestr+'.xlsx'
logger.info(f"Using lxml mode: {LXML}")
split_workbook(input_file, output_file)
logger.info("Time consumed: % s seconds" % (time.time() - start))

I am able to achieve this using _sheets.sort() the titles. Here is an example to demonstrate the same...
import openpyxl
from openpyxl.cell import WriteOnlyCell
#Create workbook in write_only mode as you did
output_workbook = Workbook(write_only=True)
#Add 4 sheets
ws1 = output_workbook.create_sheet("clientB_sheet_title")
ws2 = output_workbook.create_sheet("clientA_sheet_title")
ws3 = output_workbook.create_sheet("clientD_sheet_title")
ws4 = output_workbook.create_sheet("clientC_sheet_title")
#Add some data at start
cell = WriteOnlyCell(ws1, value="hello clientB")
ws1.append([cell, 3.14, None])
cell = WriteOnlyCell(ws2, value="hello clientA")
ws2.append([cell, 3.14, None])
cell = WriteOnlyCell(ws3, value="hello clientD")
ws3.append([cell, 3.14, None])
cell = WriteOnlyCell(ws4, value="hello clientC")
ws4.append([cell, 3.14, None])
### The key to your question - The sorting of titles ###
output_workbook._sheets.sort(key=lambda ws: ws.title)
#Finally save
output_workbook.save("output_file.xlsx")
Output excel
OPTION - 2 (better and safer option) using move_sheet()
As per Charlie Clark's recommendation, the better and safer option is to use move_sheets(). The same has been added. Note that I am only including the code that will replace the last two commands (_sheet and save). Results are the same...
asc_sheetlist = output_workbook.sheetnames
asc_sheetlist.sort()
for pos, name in enumerate(asc_sheetlist):
output_workbook.move_sheet(name, pos - output_workbook.sheetnames.index(name))
output_workbook.save("output_file.xlsx")

Based on Redox's answer, I was able to throw together a super-simple function to achieve this -
def alphabetical_client_sheet_sort(output_file):
workbook = load_workbook(output_file)
workbook._sheets.sort(key=lambda output_file: output_file.title)
workbook.save(output_file)
workbook.close()

How to set Excel PageBreak with xlwings in python?

Need help with setting PageBreak excel file with xlwings package in python.
According to Microsoft website: https://learn.microsoft.com/zh-tw/office/vba/api/excel.range.pagebreak
I've tried
app = xw.App(visible=True, add_book=False)
wb = app.books.open("raw_data/" + raw_file_name, update_links=False)
sht = wb.sheets['sheet1']
sht.api.Rows(24).PageBreak = 'xlPageBreakManual' # I would like to set on row 24
and the program stuck forever. Does anyone know how to solve the problem?
Thanks

xlwings does not know what xlPageBreakManual is unless its referenced. Nevertheless in this case it's not necessary. The default is to add a manual Page Break therefore syntax is only; sht.api.Rows(24).PageBreak = True. If you then check the PageBreak status on row 24; print(sht.api.Rows(24).PageBreak) it should return -4135 which is the excel constant for xlPageBreakManual.
You can use the constants to set the type of PageBreak, e.g. the line sht.api.Rows(24).PageBreak = True could also be written as sht.api.Rows(24).PageBreak = -4135 or sht.api.Rows(24).PageBreak = PageBreak.xlPageBreakManual
The page break values can be referenced from the xlwings constants using the syntax from xlwings.constants import PageBreak. My example shown below sets the page break at row 24 then shows the status of the previous, actual and ensuing rows. The previous and ensuing rows have a status of -4142 i.e. no page break while row 24 is -4135.
The code then removes the page break using xlPageBreakNone and the three rows 23-25 are all -4142 again.
For this test I used xlwings v0.26.1
import xlwings as xw
from xlwings.constants import PageBreak
### Constant values for reference
# xlPageBreakAutomatic = -4105
# xlPageBreakManual = -4135
# xlPageBreakNone = -4142
app = xw.App(visible=True, add_book=False)
sht = xw.Book("book.xlsx").sheets['Sheet1']
print("Add Page break at row 24")
sht.api.Rows(24).PageBreak = True
# sht.api.Rows(24).PageBreak = -4135 # This does the same as the line above
# sht.api.Rows(24).PageBreak = PageBreak.xlPageBreakManual # As does this line
print('Row23: ' + str(sht.api.Rows(23).PageBreak))
print('Row24: ' + str(sht.api.Rows(24).PageBreak))
print('Row25: ' + str(sht.api.Rows(25).PageBreak))
print("\nDelete Page break at row 24")
sht.api.Rows(24).PageBreak = PageBreak.xlPageBreakNone
print('Row23: ' + str(sht.api.Rows(23).PageBreak))
print('Row24: ' + str(sht.api.Rows(24).PageBreak))
print('Row25: ' + str(sht.api.Rows(25).PageBreak))
Note attempting to manually set a page break to -4105 (xlPageBreakAutomatic) fails which I would expect.

Python: Trouble with Asyncio/Aiohttp Websocket client requests

Been trying to extract websocket information from Bitfinex websocket client service. Below is the code. The script works fine when I search for under 30 crypto pairs (ie. "p" or "PAIRS" has 30 elements) but if I try to go higher the script never gets to the "save_data" co-routine. Any ideas why this could be happening.
I modified the script from: "https://mmquant.net/replicating-orderbooks-from-websocket-stream-with-python-and-asyncio/", kudos to Mmquant for making the code available and giving an awesome script description.
import aiohttp
import asyncio
import ujson
from tabulate import tabulate
from copy import deepcopy
import pandas as pd
from openpyxl import load_workbook
import datetime
from datetime import datetime
import numpy as np
from collections import OrderedDict
from time import sleep
"""
Load the workbook to dump the API data as well as instruct it to not generate a new sheet.
The excel work book must:
1. Be of the type ".xlsx", only this because the load_workbook function was set to call a specific sheet with .xlsx format. This can be changed.
2. Must have the worksheets, "apidata" and "Test". This can also be adjusted below.
3. The excel workbooks name is "bitfinexws.xlsx". This can be changed below.
4. The excel spreadsheet is in the same folder as this script.
"""
book = load_workbook('bitfinexwsasync.xlsx') #.xlsx Excel spreadsheet that will be used for the placement and extracting of data.
apdat = book['Sheet1'] #Assign a variable to the sheet where the trade ratios will be put. This is case sensitive.
#The next 3 lines are critical to allow overwriting of data and not creating a new worksheet when using panda dataframes.
writer = pd.ExcelWriter('bitfinexwsasync.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
#Get a list of all the ratios and add the standard trade url: "https://api.bitfinex.com/v1/book/" before the ratios.
burl = 'https://api.bitfinex.com/v1/book/' #This is the standard url for retrieving trade ratios, the pair symbol must be added after this.
sym = pd.read_json('https://api.bitfinex.com/v1/symbols',orient='values') #This is a list of all the symbols on the Bitfinex website.
p=[]
p=[0]*len(sym)
for i in range(0,len(sym)):
p[i]=sym.loc[i,0]
p=tuple(p)
m=len(p) #Max number of trade ratios to extract for this script. Script cannot run the full set of 105 trade ratios, it will time-out.
p=p[0:m]
d=[]
e=[]
j=[]
"""
NOTE:
The script cannot run for the full 105 pairs, it timesout and becomes unresponsive.
By testig the stability it was found that calling 21 pairs per script at a refresh rate of 5seconds did not allow for any time-out problems.
"""
print('________________________________________________________________________________________________________')
print('')
print('Bitfinex Websocket Trading Orderbook Extraction - Asynchronous.')
print('There are a total of ', len(sym), ' trade ratios in this exchange.')
print('Only ',m,' trading pairs will be extracted by this script, namely:',p)
print('Process initiated at',datetime.now().strftime('%Y-%m-%d %H:%M:%S'),'.') #Tells me the date and time that the data extraction was intiated.
print('________________________________________________________________________________________________________')
print('')
# Pairs which generate orderbook for.
PAIRS = p
# If there is n pairs we need to subscribe to n websocket channels.
# This the subscription message template.
# For details about settings refer to https://bitfinex.readme.io/v2/reference#ws-public-order-books.
SUB_MESG = {
'event': 'subscribe',
'channel': 'book',
'freq': 'F0', #Adjust for real time
'len': '25',
'prec': 'P0'
# 'pair': <pair>
}
def build_book(res, pair):
""" Updates orderbook.
:param res: Orderbook update message.
:param pair: Updated pair.
"""
global orderbooks
# Filter out subscription status messages.
if res.data[0] == '[':
# String to json
data = ujson.loads(res.data)[1]
# Build orderbook
# Observe the structure of orderbook. The prices are keys for corresponding count and amount.
# Structuring data in this way significantly simplifies orderbook updates.
if len(data) > 10:
bids = {
str(level[0]): [str(level[1]), str(level[2])]
for level in data if level[2] > 0
}
asks = {
str(level[0]): [str(level[1]), str(level[2])[1:]]
for level in data if level[2] < 0
}
orderbooks[pair]['bids'] = bids
orderbooks[pair]['asks'] = asks
# Update orderbook and filter out heartbeat messages.
elif data[0] != 'h':
# Example update message structure [1765.2, 0, 1] where we have [price, count, amount].
# Update algorithm pseudocode from Bitfinex documentation:
# 1. - When count > 0 then you have to add or update the price level.
# 1.1- If amount > 0 then add/update bids.
# 1.2- If amount < 0 then add/update asks.
# 2. - When count = 0 then you have to delete the price level.
# 2.1- If amount = 1 then remove from bids
# 2.2- If amount = -1 then remove from asks
data = [str(data[0]), str(data[1]), str(data[2])]
if int(data[1]) > 0: # 1.
if float(data[2]) > 0: # 1.1
orderbooks[pair]['bids'].update({data[0]: [data[1], data[2]]})
elif float(data[2]) < 0: # 1.2
orderbooks[pair]['asks'].update({data[0]: [data[1], str(data[2])[1:]]})
elif data[1] == '0': # 2.
if data[2] == '1': # 2.1
if orderbooks[pair]['bids'].get(data[0]):
del orderbooks[pair]['bids'][data[0]]
elif data[2] == '-1': # 2.2
if orderbooks[pair]['asks'].get(data[0]):
del orderbooks[pair]['asks'][data[0]]
async def save_data():
""" Save the data to the excel spreadsheet specified """
#NOTE, Adjusted this for every 5 seconds, ie "await asyncio.sleep(10)" to "await asyncio.sleep(5)"
global orderbooks
while 1:
d=[]
e=[]
j=[]
await asyncio.sleep(5)
for pair in PAIRS:
bids2 = [[v[1], v[0], k] for k, v in orderbooks[pair]['bids'].items()]
asks2 = [[k, v[0], v[1]] for k, v in orderbooks[pair]['asks'].items()]
bids2.sort(key=lambda x: float(x[2]), reverse=True)
asks2.sort(key=lambda x: float(x[0]))
table2 = [[*bid, *ask] for (bid, ask) in zip(bids2, asks2)]
d.extend(table2)
e.extend([0]*len(table2))
e[len(e)-len(table2)]=pair
j.extend([0]*len(d))
j[0]=datetime.now().strftime('%Y-%m-%d %H:%M:%S')
s = pd.DataFrame(d, columns=['bid:amount', 'bid:count', 'bid:price', 'ask:price', 'ask:count', 'ask:amount'])
r = pd.DataFrame(e, columns=['Trade pair'])
u = pd.DataFrame(j, columns=['Last updated'])
z = pd.concat([s, r, u], axis=1, join_axes=[s.index])
z.to_excel(writer, 'Sheet1', startrow=0, startcol=0, index=False)
writer.save()
print('Update completed at',datetime.now().strftime('%Y-%m-%d %H:%M:%S'),'.')
async def get_book(pair, session):
""" Subscribes for orderbook updates and fetches updates. """
#print('enter get_book, pair: {}'.format(pair))
pair_dict = deepcopy(SUB_MESG) #Allows for changes to a made within a variable.
pair_dict.update({'pair': pair}) #Updates the dictionary SUB_MESG with the new pair to be evaluated. Will be added to the end of the dictionary.
async with session.ws_connect('wss://api.bitfinex.com/ws/2') as ws:
asyncio.ensure_future(ws.send_json(pair_dict)) #This was added and replaced "ws.send_json(pair_dict)" as Ubuntu python required a link to asyncio for this function.
while 1: #Loops infinitely.
res = await ws.receive()
print(pair_dict['pair'], res.data) # debug
build_book(res, pair)
async def main():
""" Driver coroutine. """
async with aiohttp.ClientSession() as session:
coros = [get_book(pair, session) for pair in PAIRS]
# Append coroutine for printing orderbook snapshots every 10s.
coros.append(save_data())
await asyncio.wait(coros)
orderbooks = {
pair: {}
for pair in PAIRS
}
loop = asyncio.get_event_loop()
loop.run_until_complete(main())

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Preserving Data Groupings when writing to Worksheet using OpenPyxl - python

Related

Insert value in cells with spinners in MS Project (.mpp file) using Python

keep calling an API until it is updated with latest item (Python)

Alphabetically ordering Worksheets using openpxl in WriteOnlyWorkSheet mode

How to set Excel PageBreak with xlwings in python?

Python: Trouble with Asyncio/Aiohttp Websocket client requests

Categories

Resources