Background: I have created a script that takes a .xlsx file and splits the Groupings of data into separate worksheets, based on the names of each Grouping. E.g., Client A Grouped data will be written to a worksheet named Client A and so forth. Additionally, the script strips out Excel-invalid characters and miscellaneous strings contained within INVALID_TITLE_CHARS and INVALID_TITLE_NAMES, as well as uses values in management_style and model_type to decide which rows are irrelevant and therefore skipped during the copy process.
My issue: I would like to expand the script so that the Worksheets are alphabetical, e.g., Client A, Client B, Client C, and so forth. I have tried to achieve this with client_sheet._sheets.sort(key=lambda output_workbook: output_workbook.title); however, for memory consistency reasons, I have used WriteOnlyWorksheet, which seems incompatible with _sheets. Does anyone know if there is another solution? I was hoping to avoid creating a function that reopens the .xlsx.
The script: This is the full script:
import time
import logging
import sys
import datetime as dt
import datetime
import requests as requests
import json
import enlighten
import warnings
from openpyxl import load_workbook
from openpyxl import LXML
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
from copy import copy
from configparser import ConfigParser
from requests.auth import HTTPBasicAuth
logger = logging.getLogger()
timestr = datetime.datetime.now().strftime("%Y-%m-%d")
warnings.filterwarnings("ignore")
def configure_logging():
    """Attach a stdout handler with a timestamped format to the root logger at INFO level."""
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
    logger.addHandler(stream_handler)
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
INVALID_TITLE_NAMES = ["zz_ FeeRelationship", "Family"]
def clean_sheet_title(title):
    """Return *title* made safe for use as an Excel worksheet name.

    Handles None, trims whitespace, removes Excel-invalid characters and
    blacklisted substrings, and caps the result at Excel's 31-character limit.
    """
    cleaned = (title or "").strip().translate(INVALID_TITLE_CHAR_MAP)
    for blacklisted in INVALID_TITLE_NAMES:
        cleaned = cleaned.replace(blacklisted, "")
    return cleaned[:31]
def is_client_row(row, row_dimension):
    """A client (top-level) row is one whose outline level is zero."""
    return not row_dimension.outlineLevel
def create_write_only_cell(source_cell, target_sheet):
    """Clone *source_cell* into a WriteOnlyCell attached to *target_sheet*.

    Carries over value, data type, and — when the source is styled — font,
    number format and alignment. Border, fill and protection are deliberately
    not copied (as in the original, presumably to keep memory usage down).
    """
    clone = WriteOnlyCell(target_sheet, value=source_cell.value)
    clone.data_type = source_cell.data_type
    if source_cell.has_style:
        clone.font = copy(source_cell.font)
        clone.number_format = copy(source_cell.number_format)
        clone.alignment = copy(source_cell.alignment)
    return clone
def create_write_only_row(source_row, target_sheet):
    """Map every cell of *source_row* to a write-only clone for *target_sheet*."""
    cells = []
    for source_cell in source_row:
        cells.append(create_write_only_cell(source_cell, target_sheet))
    return cells
def skip_row(row, row_dimension):
    """Return True when *row* is irrelevant and must not be copied.

    A row is dropped when it is the grand-total line, or when neither the
    management style (column index 3) nor the model type (column index 11)
    holds an accepted value ("pass on either column" semantics).
    """
    def cell_text(column_index):
        return (row[column_index].value or "").strip()

    # The grand-total row is never copied.
    if row[0].value == "Total":
        return True

    accepted_management_styles = {
        "Advisory",
        "Advisory - No Fee",
        "Holding",
        "JPAS",
        "Liquidity Management",
        "Trading",
        "",
    }
    accepted_model_types = {
        "Client",
        "Holding Account",
        "Holding Company",
        "Trust",
        "",
    }

    management_ok = cell_text(3) in accepted_management_styles
    model_ok = cell_text(11) in accepted_model_types
    # Keep the row if EITHER column matches; skip only when both fail.
    return not (management_ok or model_ok)
def split_workbook(input_file, output_file):
    """Split the active sheet of *input_file* so each client gets its own sheet.

    A client row is any outline-level-0 row. Rows rejected by skip_row(), and
    all of their deeper-outlined children, are dropped. Output is written via
    a write-only workbook for memory efficiency.

    FIX: both workbook handles are now bound before the try block — previously,
    if load_workbook() raised, the finally clause hit a NameError on `workbook`.
    """
    workbook = None
    output_workbook = None
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)
        client_sheet = None
        client_row_index = 2
        processing_client = 0
        skip_child = False
        skipped_parent_outline_level = 0
        skipped_rows_per_client = 0
        rows = data_sheet.rows
        header = next(rows)
        for index, row in enumerate(rows, start=2):
            row_dimension = data_sheet.row_dimensions[index]
            # Current row is a child of a previously skipped parent: drop it too.
            if skip_child and skipped_parent_outline_level < row_dimension.outlineLevel:
                skipped_rows_per_client += 1
                continue
            # Row is no longer a child of the skipped parent: stop skipping.
            if (
                skip_child
                and skipped_parent_outline_level >= row_dimension.outlineLevel
            ):
                skip_child = False
            # Row itself is irrelevant: skip it and arm child-skipping.
            if skip_row(row, row_dimension):
                skipped_rows_per_client += 1
                skip_child = True
                skipped_parent_outline_level = row_dimension.outlineLevel
                continue
            # A new client row starts a new output sheet.
            if is_client_row(row, row_dimension):
                skipped_rows_per_client = 0
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index
                # Copy column dimensions onto the new sheet.
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet
                client_sheet.append(create_write_only_row(header, client_sheet))
            # Re-index the row dimension relative to the client's sheet
            # (header occupies row 1, hence the +2).
            new_row_index = index - skipped_rows_per_client - client_row_index + 2
            client_sheet.row_dimensions[new_row_index] = copy(row_dimension)
            client_sheet.row_dimensions[new_row_index].worksheet = client_sheet
            # Finally copy the row content itself.
            client_sheet.append(create_write_only_row(row, client_sheet))
            if index % 10000 == 0:
                logger.info(f"{index} rows processed")
        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook:
            workbook.close()
        if output_workbook:
            output_workbook.close()
if __name__ == "__main__":
    start = time.time()
    configure_logging()
    # File names carry today's date stamp, e.g. Input_File2024-01-01.xlsx.
    input_file = 'Input_File'+timestr+'.xlsx'
    output_file = 'Output_File'+timestr+'.xlsx'
    # LXML is True when openpyxl detected lxml for faster write support.
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(input_file, output_file)
    logger.info("Time consumed: % s seconds" % (time.time() - start))
I am able to achieve this by using _sheets.sort() on the titles. Here is an example to demonstrate the same...
import openpyxl
from openpyxl.cell import WriteOnlyCell

# Create workbook in write_only mode as you did.
# FIX: the original called bare `Workbook(...)`, but only `import openpyxl`
# is in scope here — qualify the name (or use `from openpyxl import Workbook`).
output_workbook = openpyxl.Workbook(write_only=True)
# Add 4 sheets, deliberately out of alphabetical order.
ws1 = output_workbook.create_sheet("clientB_sheet_title")
ws2 = output_workbook.create_sheet("clientA_sheet_title")
ws3 = output_workbook.create_sheet("clientD_sheet_title")
ws4 = output_workbook.create_sheet("clientC_sheet_title")
# Add some data at start.
cell = WriteOnlyCell(ws1, value="hello clientB")
ws1.append([cell, 3.14, None])
cell = WriteOnlyCell(ws2, value="hello clientA")
ws2.append([cell, 3.14, None])
cell = WriteOnlyCell(ws3, value="hello clientD")
ws3.append([cell, 3.14, None])
cell = WriteOnlyCell(ws4, value="hello clientC")
ws4.append([cell, 3.14, None])
### The key to your question - the sorting of titles ###
# NOTE: _sheets is a private attribute; see Option 2 below for the public
# move_sheet() alternative.
output_workbook._sheets.sort(key=lambda ws: ws.title)
# Finally save.
output_workbook.save("output_file.xlsx")
Output excel
OPTION - 2 (better and safer option) using move_sheet()
As per Charlie Clark's recommendation, the better and safer option is to use move_sheet(). The same has been added below. Note that I am only including the code that replaces the last two commands (the _sheets sort and save). The results are the same...
# Take a sorted copy of the current sheet titles.
asc_sheetlist = output_workbook.sheetnames
asc_sheetlist.sort()
# Walk the alphabetical list and shift each sheet by the delta between its
# target position and its current index. NOTE(review): this presumably
# converges because earlier positions are already settled when later sheets
# are moved — verify on a workbook with many sheets.
for pos, name in enumerate(asc_sheetlist):
    output_workbook.move_sheet(name, pos - output_workbook.sheetnames.index(name))
output_workbook.save("output_file.xlsx")
Based on Redox's answer, I was able to throw together a super-simple function to achieve this -
def alphabetical_client_sheet_sort(output_file):
    """Reopen *output_file* and reorder its worksheets alphabetically by title.

    FIX: avoids the private Workbook._sheets attribute; uses the public
    move_sheet() API instead, as recommended by the openpyxl maintainer.
    The file is saved back in place and closed.
    """
    workbook = load_workbook(output_file)
    for target_position, title in enumerate(sorted(workbook.sheetnames)):
        current_position = workbook.sheetnames.index(title)
        workbook.move_sheet(title, offset=target_position - current_position)
    workbook.save(output_file)
    workbook.close()
Related
Disclaimer: I am a beginner-level, self-taught casual 'programmer' with Python.
Background: I have a script that takes Groupings of data from an .xlsx Workbook and writes them to separate Worksheets of a separate .xlsx like so -
Before state (original_data.xlsx): you will note each top-level is a Client; underneath sits multiple levels of underlying data pertaining to its parent client.
After state (split_data.xlsx): you will note each client and their underlying data is written to an identically named Worksheet in a new .xlsx file like so:
Issue: you will note from the After state that the Groupings / Levels have been lost, whilst all the data is present and has different levels of indentation.
Does anyone know how I might enhance my script to ensure that the Groupings / Levels are preserved? E.g., so you can still expand each Grouping (e.g., Client A) like so:
My script: here is the script (sorry it's messy!) which achieves the above, minus the preserved Grouping/Levels.
import openpyxl
from copy import copy
from openpyxl import load_workbook
columns=['A','B','C','D','E','F','G','H','I','J','K','L']
def copy_cell(ws, row, ws_row, ws1):
    """Copy the cell styles of row *ws_row* in *ws1* onto row *row* in *ws*.

    Walks columns A..L; for every styled source cell the font, border, fill,
    number format, protection and alignment are copied across.
    """
    for column_letter in columns:
        source = ws1[column_letter + str(ws_row)]
        target = ws[column_letter + str(row)]
        if source.has_style:
            target.font = copy(source.font)
            target.border = copy(source.border)
            target.fill = copy(source.fill)
            target.number_format = copy(source.number_format)
            target.protection = copy(source.protection)
            target.alignment = copy(source.alignment)
# Scan column A of the source sheet: every zero-indent cell marks the start of
# a client block; a None cell marks the end of the data.
wb1 = openpyxl.load_workbook('original_data.xlsx')
ws1=wb1.active
indexs=[]
clients=[]
index=1
# NOTE(review): ws1['A'+str(index)] is always a truthy cell object, so this
# loop only terminates via the `break` below — confirm the data always ends
# with an empty cell.
while ws1['A'+str(index)]:
    if str(ws1['A'+str(index)].alignment.indent)=='0.0':
        indexs.append(index)
        clients.append(ws1['A'+str(index)].value)
    if ws1['A'+str(index)].value is None:
        indexs.append(index)
        break
    index+=1
wb1.close()
# Write each client block to its own sheet of a new workbook.
wb = openpyxl.Workbook()
ws=wb.active
start_index=1
headers=['Ownership Structure', 'Fee Schedule', 'Management Style', 'Advisory Firm', 'Inception Date', 'Days in Time Period', 'Adjusted Average Daily Balance (No Div, USD)', 'Assets Billed On (USD)',
'Effective Billing Rate', 'Billing Fees (USD)', 'Bill To Account', 'Model Type']
for y,index in enumerate(indexs):
    try:
        client=0
        # Excel sheet titles are limited to 31 characters.
        if len(clients[y])>=32:
            client=clients[y][:31]
        else:
            client=clients[y]
        wb.create_sheet(client)
        ws=wb[client]
        ws.column_dimensions['A'].width=35
        ws.append(headers)
        row_index=2
        # Copy every row of this client block (values + styles).
        for i in range(start_index,indexs[y+1]):
            ws.append([ws1[col+str(i)].value for col in columns])
            copy_cell(ws,row_index,i,ws1)
            row_index+=1
        start_index=indexs[y+1]
    # NOTE(review): bare except hides real failures (e.g. the IndexError at
    # the final block) — consider catching IndexError explicitly.
    except:
        pass
wb.save('split_data.xlsx')
wb.close()
# Remove the default sheets left over from Workbook() creation.
try:
    wb1 = openpyxl.load_workbook('split_data.xlsx')
    a=wb1['Sheet']
    wb1.remove(a)
    a=wb1['Sheet1']
    wb1.remove(a)
    wb1.save('split_data.xlsx')
    wb1.close()
except:
    pass
Resources: here is a link to some test data (original_data.xlsx)
from openpyxl import load_workbook
def get_client_rows(sheet):
    """Return the row numbers of top-level (client) rows.

    A client row is any row from row 2 onward whose first cell has no
    indentation (indent == 0.0); row 1 is assumed to be the header.

    FIX: removed an unreachable second return statement (dead code) that
    selected rows via row_dimensions outline_level == 0 instead — an
    alternative detection strategy left over from editing.
    """
    return [
        row[0].row
        for row in sheet.iter_rows(2)
        if row[0].alignment.indent == 0.0
    ]
def delete_client_block(sheet, start, end):
    """Delete rows *start* up to and including *end* from *sheet*.

    The stored row dimensions for the range are dropped first so no stale
    outline information survives the row deletion.
    """
    for row_number in range(start, end + 1):
        sheet.row_dimensions.pop(row_number, None)
    amount = end - start + 1
    sheet.delete_rows(start, amount)
def split_workbook(input_file, output_file):
    """
    Split workbook: each main group gets its own sheet.
    Not to lose any formatting, we copy the current sheet and remove all rows
    which do not belong to the extracted group.
    """
    # NOTE(review): `workbook` is referenced in the finally clause; if
    # load_workbook() raises, that reference is a NameError — confirm.
    try:
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        client_rows = get_client_rows(data_sheet)
        for index, client_row in enumerate(client_rows):
            # Create new sheet for given client; shorten the title as it
            # might exceed Excel's limit.
            client_sheet = workbook.copy_worksheet(data_sheet)
            client_sheet.title = data_sheet.cell(client_row, 1).value[:32]
            # Delete rows after the current client, if any.
            if index < len(client_rows) - 1:
                row_after_client = client_rows[index + 1]
                delete_client_block(
                    client_sheet, row_after_client, client_sheet.max_row
                )
            # Delete rows before the current client, if any.
            if index > 0:
                first_client_row = client_rows[0]
                delete_client_block(
                    client_sheet, first_client_row, client_row - first_client_row + 1
                )
                # Move left-over row dimensions to the top of the sheet.
                # NOTE(review): indentation reconstructed — this loop only
                # makes sense inside the `index > 0` branch, since it reads
                # `first_client_row`; confirm against the original source.
                for row_index in list(client_sheet.row_dimensions.keys()):
                    # Skip the header row dimension.
                    if row_index > first_client_row - 1:
                        row_dimension = client_sheet.row_dimensions.pop(row_index)
                        new_index = row_index - client_row + first_client_row
                        row_dimension.index = new_index
                        client_sheet.row_dimensions[new_index] = row_dimension
        # Drop the original combined sheet; only per-client copies remain.
        del workbook[data_sheet.title]
        workbook.save(output_file)
    finally:
        workbook.close()
if __name__ == "__main__":
    # Input workbook holding all clients; output receives one sheet per client.
    input_file = 'data.xlsx'
    output_file = "data_output.xlsx"
    split_workbook(input_file, output_file)
My goal is to use a list of tickers together with the TWS API to extract parts of the company snapshot (reqFundamentalData() -> "ReportSnapshot") and the financial statements (reqFundamentalData() -> "ReportsFinStatements") of these tickers, convert into a dataframe and store it as a parquet file.
I tried to merge solutions provided:
use a list of tickers
TWS API to download stock fundamental data runs only the first data entry and ignores the others. How to solve this?
store XML as dataframe
Converting XML to Pandas
store data
Save data from TWS API to csv file
Code:
from datetime import datetime
from bs4 import BeautifulSoup as bs
import pandas as pd
from ibapi.client import EClient
from ibapi.contract import Contract
from ibapi.wrapper import EWrapper
import logging
import random
import pathlib
import time
from datetime import date
import datetime
from pathlib import Path
class TestApp(EWrapper, EClient):
    """IB TWS client that requests fundamental data for a queue of contracts.

    Contracts are registered via addContracts() under sequential request ids
    starting at 8001; once the connection is ready (nextValidId), requests are
    issued one at a time, each callback triggering the next request.
    """

    def __init__(self, addr, port, client_id):
        # NOTE(review): addr/port/client_id are unused here — the caller
        # connects explicitly via client.connect(...). Kept for interface
        # compatibility.
        EWrapper.__init__(self)
        EClient.__init__(self, self)
        self.firstReqId = 8001
        self.contracts = {}  # reqId -> Contract, kept in a dict for lookup
        self.contNumber = self.firstReqId
        # Accumulators for the downloaded results.
        self.df_company_info = pd.DataFrame(data=None, index=None, columns=None)
        self.df_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)

    def addContracts(self, cont):
        """Queue *cont* under the next sequential request id (8001, 8002, ...)."""
        self.contracts[self.contNumber] = cont
        self.contNumber += 1

    def nextValidId(self, orderId: int):
        """EWrapper callback: the API is ready — start requesting data.

        This is IB's recommended signal that the connection is usable, so no
        sleeps are needed before the first request.
        """
        self.contNumber = self.firstReqId  # restart from the first reqId
        self.getNextData()

    def error(self, reqId, errorCode, errorString):
        """EWrapper callback for any API error; continue with the next contract."""
        print("Error: ", reqId, "", errorCode, "", errorString)
        # If a request failed, just continue with the next id.
        # NOTE(review): the guard checks self.contNumber but indexes reqId —
        # presumably intentional (contNumber is already advanced); confirm.
        if reqId > 0 and self.contracts.get(self.contNumber):
            print('err in', self.contracts[reqId].symbol)
            self.getNextData()  # try the next one

    def fundamentalData(self, reqId, fundamental_data):
        """EWrapper callback delivering the fundamental-data XML for *reqId*.

        FIX: the original stored the payload as ``self.fundamentalData``,
        overwriting this very method with a string — the next callback then
        failed with ``TypeError: 'str' object is not callable``. The payload
        now lives in the separately named ``self.fundamental_data``.
        """
        self.fundamental_data = fundamental_data
        try:
            if self.fundamental_data is not None:
                # Convert the XML payload to a dict entry and append it.
                dict_company_info = self.CompanyInfoXMLtoDict(self.fundamental_data)
                df_add_row = pd.DataFrame([dict_company_info])
                # DataFrame.append was deprecated and removed in pandas 2.0;
                # pd.concat is the equivalent replacement.
                self.df_company_info = pd.concat(
                    [self.df_company_info, df_add_row], ignore_index=True
                )
        except (KeyError, TypeError, ValueError, IndexError):
            # The original had four identical handlers — collapsed into one.
            print('Ticker: ' + str(self.contNumber) + ' could not get company_info')
        self.getNextData()

    def getNextData(self):
        """Request data for the next queued contract, or disconnect when done."""
        if self.contracts.get(self.contNumber):  # a contract exists for this id
            self.reqFundamentalData(self.contNumber, self.contracts[self.contNumber], "ReportSnapshot", [])
            self.contNumber += 1  # get ready for the next request
        else:  # no more sequentially numbered contracts
            print('done')
            self.disconnect()  # just exit

    def CompanyInfoXMLtoDict(self, fundamentals):
        """Extract ticker, company name and employee count from a
        ReportSnapshot XML payload; returns {} when no ticker is found."""
        soup = bs(fundamentals, 'xml')
        df_company_info = pd.DataFrame(data=None, index=None, columns=None)
        ticker = ''
        longName = ''
        fullTimeEmployees = 0
        # Search each tag, then match the unique identifying attribute.
        for issues in soup.find_all('IssueID'):
            if issues.get('Type') == "Ticker":
                ticker = issues.get_text()
                break
        for coID_i in soup.find_all('CoID'):
            if coID_i.get('Type') == "CompanyName":
                longName = coID_i.get_text()
                break
        for employees_i in soup.find_all('Employees'):
            fullTimeEmployees = employees_i.get_text()
            break
        # Create the result entry row.
        if ticker is not None and ticker != '':
            new_row_dict = {'ticker': ticker, 'name': longName,
                            'num_employees': fullTimeEmployees}
        else:
            new_row_dict = {}
        return new_row_dict

    def FinStmtsXMLtoDF(self, fundamentals, ticker, stmts_type):
        """Parse a ReportsFinStatements XML payload into one merged DataFrame.

        Balance sheet, income statement and cash-flow rows for the requested
        period type ('annual' -> Annual, otherwise Interim) are merged on
        rep_date; ticker and the update date are prepended as columns.
        """
        today = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        today_date = date.today().strftime("%Y-%m-%d")
        if stmts_type == 'annual':
            period_type = 'Annual'
        else:
            period_type = 'Interim'
        soup = bs(fundamentals, 'xml')
        # Build the coaItem-code -> human-readable-description mapping.
        stmts_terms = {}
        for terms in soup.find_all("mapItem"):
            stmts_terms[terms.get('coaItem')] = terms.get_text()
        bal_l = []
        inc_l = []
        cas_l = []
        for period in soup.find_all('FiscalPeriod'):
            # Quarterly vs. annually.
            if period.get('Type') == period_type:
                for statement in period.find_all('Statement'):
                    if statement.find('UpdateType').get('Code') != 'CLA':
                        dic = {}
                        stmts_type = statement.get('Type')
                        statement_date = statement.find('StatementDate').text
                        dic['rep_date'] = statement_date
                        for item in statement.find_all('lineItem'):
                            dic[stmts_terms.get(item.get('coaCode'), 'DEFAULT')] = item.text
                        if stmts_type == 'BAL':
                            bal_l.append(dic)
                        elif stmts_type == 'INC':
                            inc_l.append(dic)
                        elif stmts_type == 'CAS':
                            cas_l.append(dic)
        df_balance_sheet = pd.DataFrame(bal_l).sort_values('rep_date')
        df_income_statement = pd.DataFrame(inc_l).sort_values('rep_date')
        df_cash_flow = pd.DataFrame(cas_l).sort_values('rep_date')
        # Merge all statements sharing the same rep_date.
        df_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
        df_fin_stmts = df_balance_sheet.merge(df_income_statement, how='left',
                                              left_on=['rep_date'],
                                              right_on=['rep_date'])
        df_fin_stmts = df_fin_stmts.merge(df_cash_flow, how='left',
                                          left_on=['rep_date'],
                                          right_on=['rep_date'])
        df_fin_stmts.insert(loc=0, column='ticker', value=ticker)
        df_fin_stmts.insert(loc=1, column='date_updated', value=today_date)
        return df_fin_stmts
def main():
    """Entry point: load tickers, queue contracts, run the client, persist results.

    NOTE(review): this function is truncated as posted — the try blocks opened
    below have no matching except/finally clauses, so this code does not parse
    as-is; the remainder was lost when the snippet was pasted.
    """
    # ----- config
    project_data_folder = '/home/data/'
    project_data_folder = Path(project_data_folder)
    # Tickers are stored in a csv file.
    # NOTE(review): path lacks a leading '/' — presumably '/home/data/...'.
    csv_master_ticker = Path('home/data/ticker/ticker-list.csv')
    # Load the list of tickers.
    df = pd.read_csv(csv_master_ticker)
    list_master_ticker = df['ticker'].tolist()
    fusion_company_info = pd.DataFrame(data=None, index=None, columns=None)
    fusion_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
    fusion_q_fin_stmts = pd.DataFrame(data=None, index=None, columns=None)
    client = TestApp('127.0.0.1', 7496, 0)
    for ticker in list_master_ticker:
        # Remove additional postfix for exchange e.g. XYZ.F -> XYZ.
        # NOTE(review): rstrip('.') only strips trailing dots, not a '.F'
        # suffix — confirm intent.
        ticker_clean = ticker.rstrip('.')
        contract = Contract()
        contract.symbol = ticker_clean
        contract.secType = 'STK'
        contract.exchange = "SMART"
        contract.currency = 'USD'
        client.addContracts(contract)
    client.connect('127.0.0.1', 7496, 0)
    client.run()
    if fusion_company_info.empty:
        fusion_company_info = client.df_company_info
    else:
        fusion_company_info = pd.concat([fusion_company_info, client.df_company_info])
    tws_company_info_file_name = 'tws_company_info.parquet'
    file_name = project_data_folder / tws_company_info_file_name
    try:
        if fusion_company_info is not None:
            if not fusion_company_info.empty:
                fusion_company_info.to_parquet(file_name, engine='pyarrow')
    # financial statements - annual
    tws_fin_stmts_file_name = 'tws_fin_stmts.parquet'
    file_name = project_data_folder / tws_fin_stmts_file_name
    try:
        if fusion_fin_stmts is not None:
            if not fusion_fin_stmts.empty:
                fusion_fin_stmts.to_parquet(file_name, engine='pyarrow')
I get an error message
Traceback (most recent call last):
File "...\ibapi\client.py", line 239, in run
self.decoder.interpret(fields)
File "...\ibapi\decoder.py", line 1278, in interpret
self.interpretWithSignature(fields, handleInfo)
File "...\ibapi\decoder.py", line 1259, in interpretWithSignature
method(*args)
TypeError: 'str' object is not callable
python-BaseException
Can someone help me with this error message?
If I remove the for loop and run it only for a single ticker e.g.
# Single-ticker variant (no for loop): queue exactly one AMD contract and run.
client.contracts = {}
contract = Contract()
contract.symbol = 'AMD'
contract.secType = 'STK'
contract.currency = 'USD'
contract.exchange = "SMART"
client.addContracts(contract)
client.connect('127.0.0.1', 7496, 0)
client.run()
I don't get a error message and the dataframe self.company_info get's populated with the correct data of AMD.
General questions:
Is it possible to get via reqFundamentalData() not only the company info "ReportSnapshot", but also the financial statements "ReportsFinStatements" (df_fin_stmts and the function "FinStmtsXMLtoDF") in one request/run ?
I am new to Python and would expect functions to be executed only if they are called within the code, but with the TWS API (socket, reqId) it seems to work differently, and it's not fully clear to me when each function is called after one another.
E.g., how do I know that by executing reqFundamentalData() the function fundamentalData() is called? Or, e.g., nextValidId() is somehow triggered, but not explicitly called within the program. Is there a good tutorial introducing the order in which these functions are called?
Thank you very much
I would like to use python to connect data from Excel to Google Sheet.
But I got error "TypeError: Object of type datetime is not JSON serializable"
I guess the error happened in line with "wks.update_cells(cell_list)". May I know how to solve this kind of error? Thank you
import os
import pandas as pd
import glob
import datetime
import numpy as np
import time
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import json
def numberToLetters(q):
    """Convert a 1-based column number to its spreadsheet column name.

    Examples: 1 -> 'A', 26 -> 'Z', 27 -> 'AA', 703 -> 'AAA'.
    """
    letters = []
    index = q - 1
    while index >= 0:
        index, remainder = divmod(index, 26)
        letters.append(chr(remainder + 65))
        index -= 1
    return "".join(reversed(letters))
current_path = os.getcwd()
# Load the first workbook in the cwd whose name starts with 'Studio'.
# NOTE(review): read_excel's `encoding` argument was removed in newer pandas.
raw_df = pd.read_excel(glob.glob(os.path.join(current_path , 'Studio*'))[0],sheet_name = "Dashboard", encoding = "ISO-8859-1")
# Replace NaN with empty strings so empty cells upload cleanly.
df = raw_df.replace(np.nan, '', regex=True)
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name('startup_funding.json', scope)
gc = gspread.authorize(credentials)
wks = gc.open("Testing_Dashboard_Upload").sheet1
wks.clear()
columns = df.columns.values.tolist()
# Selection of the header range (row 1) that will be updated.
cell_list = wks.range('A1:'+numberToLetters(len(columns))+'1')
# Write the column names into the header cells.
for cell in cell_list:
    val = columns[cell.col-1]
    # if type(val) is str:
    #     val = val.decode('utf-8')
    cell.value = val
# Update in batch.
wks.update_cells(cell_list)
#4. handle content
# Number of lines and columns.
num_lines, num_columns = df.shape
# Selection of the data range (rows 2..n+1) that will be updated.
cell_list = wks.range('A2:'+numberToLetters(num_columns)+str(num_lines+1))
# Copy each DataFrame value into its cell.
for cell in cell_list:
    val = df.iloc[cell.row-2,cell.col-1]
    if isinstance(val, (np.float64, np.int64, np.bool_ )):
        # val = np.asscalar(val)
        # DeprecationWarning: np.asscalar(a) is deprecated since NumPy v1.16, use a.item() instead
        val = val.item()
    # NOTE(review): datetime values reach this point unconverted — the
    # "Object of type datetime is not JSON serializable" error is raised by
    # update_cells() below; convert datetimes to strings first (see answer).
    cell.value = val
#5. update in batch
wks.update_cells(cell_list)
The Python json library won't serialize datetime objects. You have to do it yourself. Find out which value in cell_list is a datetime, and convert it to a string using the strftime method. From your code I think you are setting cell.value to a datetime object. If that is so, you may change the line
cell.value = val
to
if isinstance(val, datetime.datetime):
val = val.strftime("%m/%d/%Y, %H:%M:%S")
cell.value = val
I've solved the problem. The problem is related my %PATH%
I have a script which work with an argument. In powershell I've tried the command you can see below;
.\dsrf2csv.py C:\Python27\a\DSR_testdata.tsv.gz
And also you can see the script below,
def __init__(self, dsrf2csv_arg):
    # Path of the input DSR file, e.g. DSR_testdata.tsv.gz.
    self.dsrf_filename = dsrf2csv_arg
    # Split into directory and file name so outputs land next to the input.
    dsrf_path, filename = os.path.split(self.dsrf_filename)
    # Output names are derived from the input name: DSR -> Report / Summary.
    self.report_outfilename = os.path.join(dsrf_path, filename.replace('DSR', 'Report').replace('tsv', 'csv'))
    self.summary_outfilename = os.path.join(dsrf_path, filename.replace('DSR', 'Summary').replace('tsv.gz', 'csv'))
But when I try to run this script, nothing happens. How should I run this script with a file? (example: testdata.tsv.gz)
Note : Script and file in same location.
Full Script:
import argparse
import atexit
import collections
import csv
import gzip
import os
# Row type codes that are ignored while parsing.
SKIP_ROWS = ['HEAD', '#HEAD', '#SY02', '#SY03', '#AS01', '#MW01', '#RU01',
             '#SU03', '#LI01', '#FOOT']
# Column order of the per-asset report CSV.
REPORT_HEAD = ['Asset_ID', 'Asset_Title', 'Asset_Artist', 'Asset_ISRC',
               'MW_Asset_ID', 'MW_Title', 'MW_ISWC', 'MW_Custom_ID',
               'MW_Writers', 'Views', 'Owner_name', 'Ownership_Claim',
               'Gross_Revenue', 'Amount_Payable', 'Video_IDs', 'Video_views']
# Column order of the financial summary CSV.
SUMMARY_HEAD = ['SummaryRecordId', 'DistributionChannel',
                'DistributionChannelDPID', 'CommercialModel', 'UseType',
                'Territory', 'ServiceDescription', 'Usages', 'Users',
                'Currency', 'NetRevenue', 'RightsController',
                'RightsControllerPartyId', 'AllocatedUsages', 'AmountPayable',
                'AllocatedNetRevenue']


class DsrfConverter(object):
    """Converts DSRF 3.0 to YouTube CSV."""

    def __init__(self, dsrf2csv_arg):
        """Create output file names derived from the input file name."""
        self.dsrf_filename = dsrf2csv_arg
        dsrf_path, filename = os.path.split(self.dsrf_filename)
        # FIX: was print(dsrf_filename) — a NameError; the attribute is
        # self.dsrf_filename.
        print(self.dsrf_filename)
        input("Press Enter to continue...")  # NOTE(review): debug pause left in place
        self.report_outfilename = os.path.join(dsrf_path, filename.replace(
            'DSR', 'Report').replace('tsv', 'csv'))
        self.summary_outfilename = os.path.join(dsrf_path, filename.replace(
            'DSR', 'Summary').replace('tsv.gz', 'csv'))

    def parse_blocks(self, reader):
        """Generator for parsing all the blocks from the file.

        Args:
            reader: the handler of the input file
        Yields:
            block_lines: A full block as a list of rows.
        """
        block_lines = []
        current_block = None
        for line in reader:
            if line[0] in SKIP_ROWS:
                continue
            # Exit condition.
            if line[0] == 'FOOT':
                yield block_lines
                # FIX: was `raise StopIteration()` — since PEP 479 (Python
                # 3.7) that becomes a RuntimeError inside a generator;
                # `return` is the correct way to finish.
                return
            line_block_number = int(line[1])
            if current_block is None:
                # Initialize on the first data row.
                current_block = line_block_number
            if line_block_number > current_block:
                # End of block: yield it and start a new one.
                yield block_lines
                block_lines = []
                current_block = line_block_number
            block_lines.append(line)
        # Also return the last block.
        yield block_lines

    def process_single_block(self, block):
        """Handles a single block in the DSR report.

        Args:
            block: Block as a list of lines.
        Returns:
            (summary_rows, report_row) tuple.
        """
        views = 0
        gross_revenue = 0
        summary_rows = []
        owners_data = {}
        # An ordered dictionary with a key for every report column.
        report_row_dict = collections.OrderedDict(
            [(column_name.lower(), '') for column_name in REPORT_HEAD])
        for line in block:
            if line[0] == 'SY02':  # Save the financial summary.
                summary_rows.append(line[1:])
                continue
            if line[0] == 'AS01':  # Sound recording information.
                report_row_dict['asset_id'] = line[3]
                report_row_dict['asset_title'] = line[5]
                report_row_dict['asset_artist'] = line[7]
                report_row_dict['asset_isrc'] = line[4]
            if line[0] == 'MW01':  # Composition information.
                report_row_dict['mw_asset_id'] = line[2]
                report_row_dict['mw_title'] = line[4]
                report_row_dict['mw_iswc'] = line[3]
                report_row_dict['mw_writers'] = line[6]
            if line[0] == 'RU01':  # Video level information.
                report_row_dict['video_ids'] = line[3]
                report_row_dict['video_views'] = line[4]
            if line[0] == 'SU03':  # Usage data of the sound recording asset.
                # Sum views and revenues over each sub-period.
                views += int(line[5])
                gross_revenue += float(line[6])
                report_row_dict['views'] = views
                report_row_dict['gross_revenue'] = gross_revenue
            if line[0] == 'LI01':  # Ownership information.
                if line[3] in owners_data:
                    # Already parsed a LI01 line with this owner: keep only
                    # the latest ownership, accumulate the payable amount.
                    owners_data[line[3]]['ownership'] = line[6]
                    owners_data[line[3]]['amount_payable'] += float(line[9])
                else:
                    # First LI01 line for this owner: create its entry.
                    data_dict = {'custom_id': line[5],
                                 'ownership': line[6],
                                 'amount_payable': float(line[9])}
                    owners_data[line[3]] = data_dict
        # Drop owners without an ownership share or a payable amount.
        # FIX: ownership is stored as a string; comparing str > int raised a
        # TypeError on Python 3 — convert before comparing (empty -> 0).
        owners_to_write = [o for o in owners_data
                           if (float(owners_data[o]['ownership'] or 0) > 0
                               and owners_data[o]['amount_payable'] > 0)]
        report_row_dict['owner_name'] = '|'.join(owners_to_write)
        report_row_dict['mw_custom_id'] = '|'.join([owners_data[o]
                                                    ['custom_id']
                                                    for o in owners_to_write])
        report_row_dict['ownership_claim'] = '|'.join([owners_data[o]
                                                       ['ownership']
                                                       for o in owners_to_write])
        report_row_dict['amount_payable'] = '|'.join([str(owners_data[o]
                                                      ['amount_payable'])
                                                      for o in owners_to_write])
        # Sanity check: the number of values must match the number of columns.
        assert len(report_row_dict) == len(REPORT_HEAD), 'Row is wrong size :/'
        return summary_rows, report_row_dict

    def run(self):
        """Convert the input file, writing the gzipped report CSV and the summary CSV."""
        finished = False

        def removeFiles():
            # Delete partial outputs if the conversion did not finish.
            if not finished:
                os.unlink(self.report_outfilename)
                os.unlink(self.summary_outfilename)

        atexit.register(removeFiles)
        # FIX: the csv module needs *text* streams on Python 3 — the original
        # 'rb'/'wb' modes handed it bytes and failed; newline='' is the
        # csv-writer requirement.
        with gzip.open(self.dsrf_filename, 'rt') as dsr_file, gzip.open(
                self.report_outfilename, 'wt', newline='') as report_file, open(
                self.summary_outfilename, 'w', newline='') as summary_file:
            dsr_reader = csv.reader(dsr_file, delimiter='\t')
            report_writer = csv.writer(report_file)
            summary_writer = csv.writer(summary_file)
            report_writer.writerow(REPORT_HEAD)
            summary_writer.writerow(SUMMARY_HEAD)
            for block in self.parse_blocks(dsr_reader):
                summary_rows, report_row = self.process_single_block(block)
                report_writer.writerow(report_row.values())
                summary_writer.writerows(summary_rows)
        finished = True
if __name__ == '__main__':
    # Single required positional argument: path to the DSR .tsv.gz input file.
    arg_parser = argparse.ArgumentParser(
        description='Converts DDEX DSRF UGC profile reports to Standard CSV.')
    required_args = arg_parser.add_argument_group('Required arguments')
    required_args.add_argument('dsrf2csv_arg', type=str)
    args = arg_parser.parse_args()
    dsrf_converter = DsrfConverter(args.dsrf2csv_arg)
    dsrf_converter.run()
In general to execute a python script in powershell like this .\script.py has two requirements:
Add the path to the python binaries to your %path%: $env:Path = $env:Path + ";C:\Path\to\python\binaries\"
Add the ending .py to the pathtext environment variable: $env:PATHEXT += ";.PY"
The latter will only be used in the current powershell session. If you want to add it to all future powershell sessions, add this line to your powershell profile (f.e. notepad $profile).
In your case there is also an issue with the python script you are trying to excute. def __init__(self) is an constructor for a class, like:
class Foo:
def __init__(self):
print "foo"
Did you give us your complete script?
I've done some simple .csv parsing in python but have a new file structure that's giving me trouble. The input file is from a spreadsheet converted into a .CSV file. Here is an example of the input:
Layout
Each set can have many layouts, and each layout can have many layers. Each layer has only one layer and name.
Here is the code I am using to parse it in. I suspect it's a logic/flow control problem because I've parsed things in before, just not this deep. The first header row is skipped via code. Any help appreciated!
import csv
import pprint
def import_layouts_schema(layouts_schema_file_name='C:\\layouts\\LAYOUT1.csv'):
    """Parse a layouts-schema CSV into a nested dict.

    The sheet has four columns -- SET, LAYOUT, LAYER, OBJECT -- where the
    SET and LAYOUT cells are left blank on continuation rows.  Blank cells
    inherit the last non-blank value seen above them, producing:

        {set_name: {layout_name: {layer_name: obj_name}}}

    e.g. {'RESTAURANT': {'RR_FACING1': {'BACKDROP': 'restaurant1'}}}

    Args:
        layouts_schema_file_name: path to the CSV file to parse.

    Returns:
        The nested ``{set: {layout: {layer: obj}}}`` dict.

    Fixes vs. the original:
      * the ``layouts_schema_file_name`` parameter is now actually used
        (the original shadowed it with a hard-coded path);
      * the file handle is closed via ``with``;
      * nested dicts are accumulated with ``setdefault`` instead of
        ``dict.update``, which silently overwrote every layout/layer for a
        set except the last one parsed.
    """
    layouts_schema = {}
    current_set = ''     # last non-blank SET cell seen
    current_layout = ''  # last non-blank LAYOUT cell seen
    print('============ Importing LAYOUTS schema from: ',
          layouts_schema_file_name, ' ==============')
    with open(layouts_schema_file_name, newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row
        for row in reader:
            if len(row) < 4:
                continue  # ignore blank/short lines
            # Blank SET/LAYOUT cells inherit the previous non-blank value.
            if row[0].strip():
                current_set = row[0].strip()
            if row[1].strip():
                current_layout = row[1].strip()
            if not current_set or not current_layout:
                continue  # data row before any set/layout header
            layer_name = row[2].strip()
            obj_name = row[3].strip()
            # Accumulate instead of overwrite: setdefault keeps the
            # already-collected layouts/layers for this set intact.
            layouts_schema.setdefault(current_set, {}) \
                          .setdefault(current_layout, {})[layer_name] = obj_name
    print('=========== End of layouts schema import =========')
    return layouts_schema
layouts_schema = import_layouts_schema()
Feel free to throw any part away that doesn't work. I suspect I've gotten inside my own head a little bit here. A for loop or another while loop may do the trick. Ultimately I just want to parse the file into a dict with the same key structure shown. i.e. the final dict's first line would look like:
{'RESTAURANT': {'RR_FACING1': {'BACKDROP': 'restaurant1'}}}
And the rest on from there. Ultimately I am going to use this key structure and the dict for other purposes. Just can't get the parsing down!
Wow, that's a lot of code!
Maybe try something simpler :
# Build one dict per data row by zipping the header line with each row.
with open('file.csv') as f:
    # Fixed: without rstrip('\n') the last key (and the last value of every
    # row) silently kept a trailing newline.
    keys = f.readline().rstrip('\n').split(';')  # assuming ";" is your csv fields separator
    for line in f:
        vals = line.rstrip('\n').split(';')
        d = dict(zip(keys, vals))
        print(d)
Then either make a better data file (without blanks), or have the parser remembering the previous values.
While I agree with @AK47 that the code review site may be the better approach, I received so much help from SO that I'll try to give back a little: IMHO you are overthinking the problem. Please find below an approach that should get you in the right direction and doesn't even require converting from Excel to CSV (I like the xlrd module, it's very easy to use). If you already have a CSV, just exchange the loop in the process_sheet() function. Basically, I just store the last value seen for "SET" and "LAYOUT" and if they are different (and not empty), I set the new value. Hope that helps. And yes, you should think about a better data structure (redundancy is not always bad, if you can avoid empty cells :-) ).
import xlrd
def process_sheet(sheet : xlrd.sheet.Sheet):
    """Print one ``{set: {layout: {layer: obj}}}`` dict per data row.

    Blank SET/LAYOUT cells inherit the last non-blank value seen above
    them; the header row (row 0) is skipped.
    """
    last_set = ''
    last_layout = ''
    for row_idx in range(1, sheet.nrows):
        cells = sheet.row(row_idx)
        new_set = cells[0].value.strip()
        new_layout = cells[1].value.strip()
        # Carry forward the previous value unless the cell holds a new one.
        if new_set != '' and new_set != last_set:
            last_set = new_set
        if new_layout != '' and new_layout != last_layout:
            last_layout = new_layout
        print(repr({last_set: {last_layout: {cells[2].value: cells[3].value}}}))
def main():
    """Open the test workbook and dump its first sheet to stdout."""
    workbook = xlrd.open_workbook('/tmp/test.xlsx')
    # Sheet index 0 is the first worksheet in the file.
    process_sheet(workbook.sheet_by_index(0))
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()