Background: I have created a script that takes a .xlsx file and splits the groupings of data into separate worksheets, based on the name of each grouping. E.g., Client A's grouped data will be written to a worksheet named Client A, and so forth. Additionally, the script strips out Excel-invalid characters and the miscellaneous strings contained in INVALID_TITLE_CHARS and INVALID_TITLE_NAMES, and uses the values in management_style and model_type to decide which rows are irrelevant and therefore skipped during the copy process.
My issue: I would like to expand the script so that the worksheets are alphabetical, e.g., Client A, Client B, Client C, and so forth. I have tried to achieve this with client_sheet._sheets.sort(key=lambda output_workbook: output_workbook.title); however, for memory reasons I have used WriteOnlyWorksheet, which seems incompatible with _sheets. Does anyone know if there is another solution? I was hoping to avoid creating a function that reopens the .xlsx.
The script: This is the full script:
import time
import logging
import sys
import datetime as dt
import datetime
import requests
import json
import enlighten
import warnings
from openpyxl import load_workbook
from openpyxl import LXML
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
from copy import copy
from configparser import ConfigParser
from requests.auth import HTTPBasicAuth
logger = logging.getLogger()
timestr = datetime.datetime.now().strftime("%Y-%m-%d")
warnings.filterwarnings("ignore")
def configure_logging():
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
INVALID_TITLE_NAMES = ["zz_ FeeRelationship", "Family"]
def clean_sheet_title(title):
    title = title or ""
    title = title.strip()
    title = title.translate(INVALID_TITLE_CHAR_MAP)
    for name in INVALID_TITLE_NAMES:
        title = title.replace(name, "")
    return title[:31]

def is_client_row(row, row_dimension):
    return row_dimension.outlineLevel == 0
def create_write_only_cell(source_cell, target_sheet):
    target_cell = WriteOnlyCell(target_sheet, value=source_cell.value)
    target_cell.data_type = source_cell.data_type
    if source_cell.has_style:
        target_cell.font = copy(source_cell.font)
        # target_cell.border = copy(source_cell.border)
        # target_cell.fill = copy(source_cell.fill)
        target_cell.number_format = copy(source_cell.number_format)
        # target_cell.protection = copy(source_cell.protection)
        target_cell.alignment = copy(source_cell.alignment)
    return target_cell

def create_write_only_row(source_row, target_sheet):
    return [create_write_only_cell(cell, target_sheet) for cell in source_row]
def skip_row(row, row_dimension):
    """
    Determine whether a row needs to be skipped and not copied to the new workbook.
    """
    def get_column_value(column):
        value = row[column].value or ""
        return value.strip()

    # skip total line
    if row[0].value == "Total":
        return True
    management_style = [
        "Advisory",
        "Advisory - No Fee",
        "Holding",
        "JPAS",
        "Liquidity Management",
        "Trading",
        "",
    ]
    model_type = ["Client", "Holding Account", "Holding Company", "Trust", ""]
    management_value = get_column_value(3)
    model_value = get_column_value(11)
    # Pass on either column
    return management_value not in management_style and model_value not in model_type
    # # Pass on both columns
    # return management_value not in management_style or model_value not in model_type
def split_workbook(input_file, output_file):
    """
    Split the workbook so that each client gets its own sheet.
    """
    # initialised to None so the finally block cannot raise NameError
    # if load_workbook fails
    workbook = None
    output_workbook = None
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)
        client_sheet = None
        client_row_index = 2
        processing_client = 0
        skip_child = False
        skipped_parent_outline_level = 0
        skipped_rows_per_client = 0
        rows = data_sheet.rows
        header = next(rows)
        for index, row in enumerate(rows, start=2):
            row_dimension = data_sheet.row_dimensions[index]
            # verify whether the current row is a child of a skipped parent
            if skip_child and skipped_parent_outline_level < row_dimension.outlineLevel:
                skipped_rows_per_client += 1
                continue
            # reset skip_child when the current row is no longer a child of a skipped parent
            if (
                skip_child
                and skipped_parent_outline_level >= row_dimension.outlineLevel
            ):
                skip_child = False
            # check whether the row needs to be skipped
            if skip_row(row, row_dimension):
                skipped_rows_per_client += 1
                skip_child = True
                skipped_parent_outline_level = row_dimension.outlineLevel
                continue
            # create a new sheet when a new client is found
            if is_client_row(row, row_dimension):
                skipped_rows_per_client = 0
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index
                # copy column dimensions
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet
                client_sheet.append(create_write_only_row(header, client_sheet))
            # copy row dimensions
            new_row_index = index - skipped_rows_per_client - client_row_index + 2
            client_sheet.row_dimensions[new_row_index] = copy(row_dimension)
            client_sheet.row_dimensions[new_row_index].worksheet = client_sheet
            # finally copy the row
            client_sheet.append(create_write_only_row(row, client_sheet))
            if index % 10000 == 0:
                logger.info(f"{index} rows processed")
        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook:
            workbook.close()
        if output_workbook:
            output_workbook.close()
if __name__ == "__main__":
    start = time.time()
    configure_logging()
    input_file = 'Input_File' + timestr + '.xlsx'
    output_file = 'Output_File' + timestr + '.xlsx'
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(input_file, output_file)
    logger.info("Time consumed: %s seconds" % (time.time() - start))
I am able to achieve this by sorting _sheets on the worksheet titles. Here is an example to demonstrate it...
from openpyxl import Workbook
from openpyxl.cell import WriteOnlyCell

# Create the workbook in write_only mode, as you did
output_workbook = Workbook(write_only=True)
#Add 4 sheets
ws1 = output_workbook.create_sheet("clientB_sheet_title")
ws2 = output_workbook.create_sheet("clientA_sheet_title")
ws3 = output_workbook.create_sheet("clientD_sheet_title")
ws4 = output_workbook.create_sheet("clientC_sheet_title")
#Add some data at start
cell = WriteOnlyCell(ws1, value="hello clientB")
ws1.append([cell, 3.14, None])
cell = WriteOnlyCell(ws2, value="hello clientA")
ws2.append([cell, 3.14, None])
cell = WriteOnlyCell(ws3, value="hello clientD")
ws3.append([cell, 3.14, None])
cell = WriteOnlyCell(ws4, value="hello clientC")
ws4.append([cell, 3.14, None])
### The key to your question - The sorting of titles ###
output_workbook._sheets.sort(key=lambda ws: ws.title)
#Finally save
output_workbook.save("output_file.xlsx")
(Output screenshot omitted; the saved workbook shows the sheets in alphabetical order.)
OPTION 2 (better and safer option) using move_sheet()
As per Charlie Clark's recommendation, the better and safer option is to use move_sheet(). Note that I am only including the code that replaces the last two statements above (the _sheets sort and the save). The results are the same...
asc_sheetlist = output_workbook.sheetnames
asc_sheetlist.sort()
for pos, name in enumerate(asc_sheetlist):
    output_workbook.move_sheet(name, pos - output_workbook.sheetnames.index(name))
output_workbook.save("output_file.xlsx")
Based on Redox's answer, I was able to throw together a super-simple function to achieve this:
def alphabetical_client_sheet_sort(output_file):
    workbook = load_workbook(output_file)
    workbook._sheets.sort(key=lambda ws: ws.title)
    workbook.save(output_file)
    workbook.close()
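For reference, a minimal usage sketch, assuming the same input_file and output_file names from the main block above:
split_workbook(input_file, output_file)
alphabetical_client_sheet_sort(output_file)  # reopens the .xlsx and sorts its sheets in place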
This script interrogates a CSV containing species names against a database in another CSV and reports whether they appear in both. The issue is that while it still reads all the search terms fine, it only searches for the first one; i.e., if I print speciesl before 'for row in p', all species names are returned correctly.
from pathlib import Path
import os
import csv

p = csv.reader(open('Paldat.csv', 'r', newline=''), delimiter=',')
with open('newsssssss.csv', 'r', newline='\n') as r:
    for line in r:
        taxons = line.split(',')
        no = ['\r\n']
        noo = ['\n']
        if taxons == no:
            continue
        elif taxons == noo:
            continue
        else:
            speciesl = []
            for val in taxons:
                val = val.replace('\n', '')
                speciesl.append(val)
            g = speciesl[0].lower()
            if len(speciesl) < 2:
                continue
            else:
                s = speciesl[1].lower()
                for row in p:  # This loop seems to be the issue
                    genus = row[0].lower()
                    species = row[1].lower()
                    if g == genus and s == species:
                        print('Perfect match')
                        print(g)
                    elif s == species:
                        print(speciesl)
                        print('Species found')
                    else:
                        continue
                else:
                    continue
Here is part of Paldat.csv:
Camassia,leichtlinii,monad,monad,large (51-100 µm),-,-,-,-,-,sulcate,heteropolar,oblate,-,elliptic,-,-,boat-shaped,no suitable term,aperture(s) sunken,1,sulcus,sulcate,aperture membrane ornamented,-,-,-,"reticulate, heterobrochate, perforate",-,-,-,-,-,-,-,-,-,-,-,present,,
Cistus,parviflorus,monad,monad,medium-sized (26-50 µm),-,-,-,-,-,colporate,isopolar,-,spheroidal,circular,-,-,spheroidal,circular,"aperture(s) sunken, not infolded",3,colporus,"colporate, tricolporate",-,-,-,-,striato-reticulate,-,-,-,-,-,-,-,-,-,-,-,absent,,
Camellia,japonica,monad,monad,medium-sized (26-50 µm),41-50 µm,36-40 µm,41-50 µm,41-50 µm,41-50 µm,colpate,isopolar,-,spheroidal,circular,oblique,prolate,-,triangular,aperture(s) sunken,3,colpus,"colpate, tricolpate",operculum,"granulate, scabrate, reticulate",-,-,microreticulate,-,-,-,-,-,-,-,-,-,-,-,-,,
Camellia,sinensis,monad,monad,medium-sized (26-50 µm),41-50 µm,36-40 µm,41-50 µm,41-50 µm,41-50 µm,colporate,isopolar,oblate,-,triangular,oblique,isodiametric,-,triangular,aperture(s) sunken,3,colporus,"colporate, tricolporate",operculum,"scabrate, verrucate, gemmate",-,-,"verrucate, perforate",-,-,-,-,-,-,-,-,-,-,-,-,,
And part of newsssssss.csv:
Camassia,leichtlinii
Camellia,japonica
Camellia,sinensis
Chrysanthemum,leucanthemum
Cirsium,arvense
Cissus,quadrangularis
Try removing "newline='\n'" from the "open" line.
My task is to count all the top senders and top receivers of a user's email.
So the plan is to get all the user IDs, put them in a dictionary, count their occurrences, and print them.
I tried this, but it doesn't work very well with the INBOX label (10,000+ messages):
import base64
import email
import re
import operator
from googleapiclient import errors
from quickstart import service

def find(st):
    for i in range(0, len(st)):
        tmp = str(st[i])
        for j in range(0, len(tmp)):
            if tmp[j] == 'T' and tmp[j + 1] == 'o' and tmp[j - 1] == "'" and tmp[j + 2] == "'":
                return i
    pass

def getTop(n):
    try:
        if n == 1:
            label_ids = "INBOX"
        else:
            label_ids = "SENT"
        user_id = "me"
        topers = service.users().labels().get(userId=user_id, id=label_ids).execute()
        count = topers['messagesTotal']
        print(count)
        topers = service.users().messages().list(userId=user_id, labelIds=label_ids).execute()
        arrId = []
        for i in range(0, count):
            arrId.append(topers['messages'][i]['id'])
        st = []
        for i in range(0, count):
            message = service.users().messages().get(userId=user_id,
                                                     id=arrId[i],
                                                     format='metadata').execute()
            head = message['payload']['headers']
            index = find(head)
            obval = head[index]['value']
            tmp = str(obval)
            tmp = tmp.split('<', 1)[-1]
            tmp = tmp.replace('>', "")
            st.append(tmp)
        cnt = 0
        mvalues = {}
        for mail in st:
            if not mail in mvalues:
                mvalues[mail] = 1
            else:
                mvalues[mail] += 1
        sorted_values = sorted(mvalues.items(), key=operator.itemgetter(1))
        ln = len(sorted_values)
        for j in range(1, 6):
            print(sorted_values[-j])
    except errors.HttpError as error:
        print('An error occurred: %s' % error)
My question is: what is the fastest and most correct way to get all these user emails?
If I have a lot of messages, using a while loop and making a request every time is not the best way, I guess. I've been trying to figure this out for about 4 days. Help!
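For what it's worth, here is a minimal sketch of how pagination is usually handled with the google-api-python-client, assuming the same service object from quickstart; messages().list() returns at most one page of ids, so the sketch follows nextPageToken via the library's list_next() helper, and requests only the To header via metadataHeaders to keep each per-message fetch small. It is a sketch of the general approach, not a drop-in replacement for getTop():
from quickstart import service  # assumes the same authorised service object

# Collect every message id by following nextPageToken.
request = service.users().messages().list(userId="me", labelIds=["INBOX"], maxResults=500)
message_ids = []
while request is not None:
    response = request.execute()
    message_ids.extend(m["id"] for m in response.get("messages", []))
    request = service.users().messages().list_next(request, response)

# Fetch only the To header instead of scanning the whole header list.
counts = {}
for msg_id in message_ids:
    message = service.users().messages().get(
        userId="me", id=msg_id, format="metadata", metadataHeaders=["To"]
    ).execute()
    for header in message["payload"]["headers"]:
        if header["name"].lower() == "to":
            address = header["value"].split("<", 1)[-1].replace(">", "")
            counts[address] = counts.get(address, 0) + 1

# Top five receivers by count.
for address, n in sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print(address, n)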
I have the following data sets:
Data set #1 that provides shows and the number of viewers of that show:
TVShow1,25
TVShow2,30
TVShow3,7
TVShow1,15
Data set #2 that provides channels that broadcast each show:
TVShow4,BBC
TVShow2,COM
TVShow1,TNT
TVShow3,TNT
I want to calculate the total number of viewers of each show on the channel TNT, e.g.:
TVShow1 40
TVShow3 7
I have the following mapper:
#!/usr/bin/env python
import sys

for line in sys.stdin:
    line = line.strip()
    key_value = line.split(",")
    key_in = key_value[0]
    value_in = key_value[1]
    if (value_in == 'TNT' or value_in.isdigit()):
        print('%s\t%s' % (key_in, value_in))
And the following reducer:
#!/usr/bin/env python
import sys

prev_TV_show = " "
line_cnt = 0
tnt_found = False
curr_TV_show_total_cnt = 0

for line in sys.stdin:
    line = line.strip()
    key_value = line.split('\t')
    line_cnt = line_cnt + 1
    curr_TV_show = key_value[0]
    value_in = key_value[1]
    if curr_TV_show != prev_TV_show:
        prev_TV_show = curr_TV_show
        if (line_cnt > 1 and tnt_found == True):
            print('{0} {1}'.format(curr_TV_show, curr_TV_show_total_cnt))
        tnt_found = False
        curr_TV_show_total_cnt = 0
    if (value_in == 'TNT'):
        tnt_found = True
    else:
        curr_TV_show_total_cnt += int(value_in)
Then I tested the code as follows:
cat data_file*.txt | ./my_mapper.py | sort | ./my_reducer.py
However, it seems that the total number of viewers printed on the first line is incorrect. It looks like it is merged between two TV shows. Is there an error in the code related to handling the first line?
I think that there are 2 problems in your code:
1. Updating prev_TV_show before printing causes you to print the wrong value. You actually want to print prev_TV_show with its count, not curr_TV_show.
2. Printing the last iteration's value: you need to add an additional print (plus a condition) after the loop.
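To make both fixes concrete, here is a sketch of the reducer with them applied (same tab-separated input as above, assumed sorted by show):
#!/usr/bin/env python
import sys

prev_TV_show = None
tnt_found = False
curr_TV_show_total_cnt = 0

for line in sys.stdin:
    curr_TV_show, value_in = line.strip().split('\t')
    if curr_TV_show != prev_TV_show:
        # fix 1: emit the previous show before resetting state
        if prev_TV_show is not None and tnt_found:
            print('{0} {1}'.format(prev_TV_show, curr_TV_show_total_cnt))
        prev_TV_show = curr_TV_show
        tnt_found = False
        curr_TV_show_total_cnt = 0
    if value_in == 'TNT':
        tnt_found = True
    else:
        curr_TV_show_total_cnt += int(value_in)

# fix 2: emit the last show, which the loop never prints
if prev_TV_show is not None and tnt_found:
    print('{0} {1}'.format(prev_TV_show, curr_TV_show_total_cnt))

With your sample data this prints "TVShow1 40" and "TVShow3 7", as expected.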