Transform lines from .txt file into .xls columns pandas - python

I have a text file that contains data on lines like this:
Total usage 2022-09-17T06:02:50+02:00 for vob "/Tmobile_Cro/TMobileCro" is 7868.0 Mb
Total usage 2022-09-17T06:04:18+02:00 for vob "/XpressCM/SX-BASE" is 25265.7 Mb
Total usage 2022-09-17T06:02:56+02:00 for vob "/vobs/LI" is 5916.9 Mb
I want to process this data and export it into an Excel file as:
TAG Size
/Tmobile_Cro/TmobileCro 7868.0 Mb
/XpressCM/SX-BASE 25265.7 Mb
This is my code:
import xlrd, xlwt, re
from svnscripts.timestampdirectory import createdir, path_dir
import os

def clearcasesize():
    pathdest = path_dir()
    dest = createdir()
    txtName = rf"{pathdest}\vobs_size.txt"
    workBook = xlwt.Workbook(encoding='ascii')
    workSheet = workBook.add_sheet('sheet1')
    fp = open(txtName, 'r+b')
    workSheet.write(0, 0, "TAG")
    workSheet.write(0, 1, "Size")
    row = 0
    entries = 0
    fullentry = []
    for linea in fp.readlines():
        str_linea = linea.decode('gb2312', 'ignore')
        str_linea = str_linea[:-2]  # str string
        txt = str_linea
        arr = str_linea
        if arr[:9] == "Total":
            txt = arr
            entries += 1
            s = txt.index("/")
            e = txt.index('"', s)
            txt = txt[s:e]
            fullentry.append(txt)
        elif arr.find("is") >= 0:
            entries += 1
            txt = arr
            s = txt.index("is")
            txt1 = txt[s + 7:20]
            fullentry.append(txt1)
        if row == 65536:
            break

    finalarr = []
    finalarr1 = []
    temp = 0
    row = 1
    for r in fullentry:
        finalarr.append(r)
        temp += 1
        if temp == 12:
            finalarr1.append(finalarr)
            temp = 0
            col = 0
            for arr in finalarr:
                workSheet.write(row, col, arr)
                col += 1
            row += 1
            finalarr.clear()
        if row == 65536:
            break

    workBook.save(os.path.join(dest, "ClearcaseSize.xls"))
    fp.close()

clearcasesize()
The code should work, but it only creates the column names "TAG" and "Size".
The idea of my script is that it locates the lines starting with "Total", puts the value found between the double quotes into the first column, and then uses arr.find("is") to put the size into the second column, but it's not working...
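For reference, here is a minimal sketch of that extraction using a regular expression, assuming every relevant line follows the sample format (vob "<TAG>" is <SIZE> Mb); this only illustrates the parsing idea, with the file name taken from the question:

import re

# A sketch of the intended extraction, not the asker's xlwt script: pull
# the quoted vob tag and the size that follows "is" out of each line.
pattern = re.compile(r'vob "(?P<tag>[^"]+)" is (?P<size>[\d.]+ Mb)')

with open("vobs_size.txt") as f:   # path assumed from the question
    for line in f:
        m = pattern.search(line)
        if m:
            print(m.group("tag"), m.group("size"))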

Based on your short sample:
Total usage 2022-09-17T06:02:50+02:00 for vob "/Tmobile_Cro/TMobileCro" is 7868.0 Mb
Total usage 2022-09-17T06:04:18+02:00 for vob "/XpressCM/SX-BASE" is 25265.7 Mb
Total usage 2022-09-17T06:02:56+02:00 for vob "/vobs/LI" is 5916.9 Mb
Why don't you try pandas like this:
import pandas as pd

with open("sample_file.txt") as f:
    lines = [
        line.strip().split(" vob ")[-1].replace('"', "").split(" is ")
        for line in f.readlines()
    ]

df = pd.DataFrame(
    [l for l in lines if not l[0].startswith("/vobs")],
    columns=["TAG", "Size"],
)

df.to_excel("sample_file.xlsx", index=False)
I don't have Excel, but you should get an xlsx file that looks like this:
TAG                       Size
/Tmobile_Cro/TMobileCro   7868.0 Mb
/XpressCM/SX-BASE         25265.7 Mb


How to wrap text in excel using Python?

I created a script which parses YAML files to Excel. I've made the part of the code which adjusts the column widths, but now I want to wrap the text in the Description column.
After script execution the Description text is unwrapped and overflows; what I want is the text wrapped inside the Description column. (Screenshots of both states omitted.)
Here is my code:
import yaml
import os
import pandas as pd
from openpyxl.utils import get_column_letter
import sys

def yaml_to_excel(dictionary, row):
    for key, value in dictionary.items():
        if type(value) is dict:
            yield from yaml_to_excel(value, row + 1)
        elif type(value) is list:
            for item in value:
                yield from yaml_to_excel(item, row + 1)
                row += 1
        else:
            yield (key, value, row)

def get_all_subdirectiries_in_path(root_dir):
    all_subdirectories = []
    for file in os.listdir(root_dir):
        d = os.path.join(root_dir, file)
        if os.path.isdir(d):
            all_subdirectories.append(d)
    return all_subdirectories

def dfs_tabs(df_list, sheet_list, file_name, columns):
    writer = pd.ExcelWriter(file_name, engine='openpyxl')
    for dataframe, sheet, columns_list in zip(df_list, sheet_list, columns):
        dataframe.to_excel(writer, sheet_name=sheet, startrow=0, startcol=0, index=False, columns=columns_list)
        for column in dataframe:
            column_width = max(dataframe[column].astype(str).map(len).max() + 5, len(column) + 5)
            col_letter = get_column_letter(dataframe.columns.get_loc(column) + 1)
            writer.sheets[sheet].column_dimensions[col_letter].width = column_width
    writer.save()

def get_nested_dir_path(main_dir):
    if any(File.endswith(".yml") for File in os.listdir(main_dir)):
        yield main_dir
    else:
        for file in os.listdir(main_dir):
            yield from get_nested_dir_path(main_dir + f"/{file}")

def main(dir_path):
    all_sheets = []
    sheet_names = []
    all_columns = []
    for subdirectory in get_nested_dir_path(dir_path):
        final_object = []
        last_row = 1
        current_row = 1
        for file in os.listdir(subdirectory):
            # Check if YAML file exists
            if (file.split(".")[1] == "yml") | (file.split(".")[1] == "yaml"):
                with open(subdirectory + f"/{file}", "r", encoding="utf-8") as f:
                    dataMap = yaml.safe_load(f)
                last_row += 1
                hierarchy_obj = {}
                hierarchy = 1
                prev_value = ""
                prev_temp_depth = 2
                current_row += 1
                starting_row = current_row
                for key, value, temp_depth in yaml_to_excel(dataMap, last_row):
                    if key not in hierarchy_obj:
                        hierarchy_obj[key] = hierarchy
                        hierarchy += 1
                    if prev_value != "":
                        if hierarchy_obj[key] < hierarchy_obj[prev_value]:
                            current_row += 1
                        if (temp_depth > prev_temp_depth) & (
                            hierarchy_obj[key] > hierarchy_obj[prev_value]
                        ):
                            current_row += 1
                    if type(value) is bool:
                        if value:
                            value = 'yes'
                        else:
                            value = 'no'
                    final_object.append(
                        {
                            "column": hierarchy_obj[key],
                            "value": value,
                            "row": current_row,
                        }
                    )
                    prev_value = key
                    prev_temp_depth = temp_depth
        columns_arr = []
        for column_name in hierarchy_obj.keys():
            columns_arr.append(column_name)
        all_columns.append(columns_arr)
        last_row_in_excel = final_object[len(final_object) - 1]['row']
        main_arr = []
        for i in range(last_row_in_excel):
            main_arr.append(["" for i in range(len(columns_arr))])
        main_arr = [main_arr]
        for item in final_object:
            if (type(item['value']) == int) | (type(item['value']) == float):
                main_arr[0][item['row'] - 1][item['column'] - 1] = item['value']
            else:
                main_arr[0][item['row'] - 1][item['column'] - 1] = item['value']
        result = pd.DataFrame(main_arr[0], columns=columns_arr)
        result = result[1:]
        all_sheets.append(result)
        sheet_names.append(subdirectory[subdirectory.rfind('/') + 1:].replace("/", " ").replace("_", " ")[0:31])
    dfs_tabs(all_sheets, sheet_names, 'test.xlsx', all_columns)

if __name__ == "__main__":
    dir_path = 'Lands'
    main(dir_path)
And below you can find the YAML file I base it on:
Country: France
Capital City: Paris
Language: French
Description: |-
  A country whose metropolitan part is in Western Europe, and also has overseas territories on other continents.
The most important people:
  President: Emmanuel Macron
  Prime Minister: Élisabeth Borne
  Year: 2020
Population:
  - Total population: 67 522 000
    Year: 2021
Nationality:
  - French population: 87%
    German population: 2%
    Year: 2022
Paris neighborhoods:
  - 1st district of Paris: Louvre
    Paris II district: Bourse
    Year: 2023
Additional information:
To run my script you must create the folder structure Lands/France/france.yml.
I will be grateful for any tips on how to wrap text in the Description column!
In this particular situation I suggest parsing the YAML file with pandas.json_normalize and using the format class of xlsxwriter to wrap the text in the Description column.
Try this:
import pandas as pd
from yaml import safe_load

with open(r"Lands\France\france.yaml", 'r', encoding='utf-8') as f:
    df = pd.json_normalize(safe_load(f))

# --- Selecting columns
description_index = df.columns.get_loc("Description")
df.iloc[:, 0:description_index+1]

with pd.ExcelWriter('final_result.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    # --- Wrapping the text of column D
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    format = workbook.add_format({'text_wrap': True})
    worksheet.set_column(description_index, description_index, None, format)
    # --- Auto-fit all the columns
    for column in df:
        column_width = max(df[column].astype(str).map(len).max(), len(column))
        col_idx = df.columns.get_loc(column)
        worksheet.set_column(col_idx, col_idx, column_width)
Output (in Excel): a screenshot showing the Description column wrapped and all columns auto-fitted.
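For context, pd.json_normalize flattens nested mappings into dotted column names, which is why Description stays a single column here. A small illustration (column order is approximate; lists such as Population remain list-valued cells unless record_path is used):

import pandas as pd
from yaml import safe_load

with open(r"Lands\France\france.yaml", encoding="utf-8") as f:
    df = pd.json_normalize(safe_load(f))
print(df.columns.tolist())
# ['Country', 'Capital City', 'Language', 'Description', 'Population',
#  'Nationality', 'Paris neighborhoods',
#  'The most important people.President',
#  'The most important people.Prime Minister',
#  'The most important people.Year']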

Load a spreadsheet and copy a row and paste it in a different location

How can I copy a row, for example from D51 to F51, and paste these values in the row T20 to AF20?
I know how to load a spreadsheet:
from openpyxl import load_workbook

workbook = load_workbook(output)
sheet = workbook.active
But I don't know how to iterate in a loop to get this:
sheet["T2"] = "=D6"
sheet["U2"] = "=E6"
sheet["V2"] = "=F6"
sheet["W2"] = "=G6"
sheet["X2"] = "=H6"
sheet["Y2"] = "=I6"
sheet["Z2"] = "=J6"
sheet["AA2"] = "=K6"
sheet["AB2"] = "=L6"
sheet["AC2"] = "=M6"
sheet["AD2"] = "=N6"
sheet["AE2"] = "=O6"
sheet["AF2"] = "=P6"
You can achieve this by using the code below.
Note that the file output.xlsx is opened, updated and saved. The function num_to_excel_col is borrowed from here.
This will update columns 20 (T) onwards for the next 15 columns (all in row 2) with the text "=D6", "=E6", etc. The num_to_excel_col function converts a column number to the equivalent Excel column string (for example, 27 is converted to AA).
import openpyxl

workbook = openpyxl.load_workbook('output.xlsx')
ws = workbook.active

def num_to_excel_col(n):
    if n < 1:
        raise ValueError("Number must be positive")
    result = ""
    while True:
        if n > 26:
            n, r = divmod(n - 1, 26)
            result = chr(r + ord('A')) + result
        else:
            return chr(n + ord('A') - 1) + result

outcol = 4  # Paste in col 'D'
for col in range(20, 35):  # col 20 is T; do this for the next 15 columns
    txt = "=" + num_to_excel_col(outcol) + "6"
    print(txt)
    ws.cell(row=2, column=col).value = txt
    outcol += 1

workbook.save("output.xlsx")
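If relying on openpyxl's helpers is acceptable (the script already uses openpyxl), the hand-rolled converter can be replaced by the built-in get_column_letter; a minimal equivalent sketch:

import openpyxl
from openpyxl.utils import get_column_letter

workbook = openpyxl.load_workbook('output.xlsx')
ws = workbook.active

# Same effect as the loop above, but using openpyxl's own column-letter
# helper instead of num_to_excel_col.
outcol = 4                    # source column 'D'
for col in range(20, 35):     # target columns T (20) through AH (34), row 2
    ws.cell(row=2, column=col).value = "=" + get_column_letter(outcol) + "6"
    outcol += 1

workbook.save("output.xlsx")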

Fastest way to create a dataframe from hundreds of CSV files

I have 150 CSV files with two columns (time and site). I want to read each file, build a frequency dictionary ({'site': [site_number, number of occurrences of site]}) and create a DataFrame with 11 columns (user_id, site1, site2, ... site10), where user_id is parsed from the file name (../user0001.csv). Each row in the DataFrame is a unique session of 10 site visits. My code takes 150 seconds on 150 files (which is terrible). How can I improve it?
from glob import glob
import pandas as pd

def prepare_3(path_to_csv_files, session_length=10):
    word_freq = {}
    freq_dict = {}
    word_count = 0
    row = []
    columns = []
    columns.append('user_id')
    columns.extend(['site' + str(i) for i in range(1, session_length + 1)])
    lst_files = sorted(glob(path_to_csv_files))
    for csv in lst_files:
        user = int(csv[csv.find('.') - 4:csv.find('.')])
        frame = []
        frame.append(user)
        site_count = 0
        with open(csv, 'r') as f:
            f.readline()
            for line in f:
                site = line[line.find(',') + 1:].rstrip()
                site_count += 1
                if site in word_freq:
                    word_freq[site][1] += 1
                else:
                    word_count += 1
                    word_freq[site] = [word_count, 1]
                if site_count > session_length:
                    site_count = 1
                    row.append(frame)
                    frame = []
                    frame.append(user)
                    frame.append(word_freq[site][0])
                else:
                    frame.append(word_freq[site][0])
        row.append(frame)
    df = pd.DataFrame(data=row, columns=columns, dtype=int)
    df.fillna(0, inplace=True)
    return df, word_freq
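One possible direction, sketched under the assumption that each CSV has a header row and a column literally named 'site' (as the description suggests), is to read whole files with pandas and slice the visit stream into sessions instead of branching per line:

from glob import glob
import pandas as pd

# A sketch, not a benchmarked solution; the 'site' column name and the
# user-id-in-filename convention are assumptions taken from the question.
def prepare_fast(path_to_csv_files, session_length=10):
    site_freq = {}   # site -> [site_number, occurrence count]
    rows = []
    for path in sorted(glob(path_to_csv_files)):
        user = int(path[path.find('.') - 4:path.find('.')])
        sites = pd.read_csv(path, usecols=['site'])['site'].tolist()
        codes = []
        for s in sites:
            if s not in site_freq:
                site_freq[s] = [len(site_freq) + 1, 0]
            site_freq[s][1] += 1
            codes.append(site_freq[s][0])
        # slice the visit stream into sessions of session_length sites
        for i in range(0, len(codes), session_length):
            rows.append([user] + codes[i:i + session_length])
    columns = ['user_id'] + ['site' + str(i) for i in range(1, session_length + 1)]
    df = pd.DataFrame(rows, columns=columns).fillna(0).astype(int)
    return df, site_freq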

Python Pandas How to save output to csv

Hello, I'm working on my project. I want to get candidate text blocks by using the algorithm below.
My input is a CSV document which contains:
HTML column: the HTML code in a line
TAG column: the tag of the HTML code in a line
Words: the text inside the tag in a line
TC: the number of words in a line
LTC: the number of anchor words in a line
TG: the number of tags in a line
P: the number of p and br tags in a line
CTTD: TC + (0.2*LTC) + TG - P
CTTDs: the smoothed CTTD
This is my algorithm to find candidate text blocks. I turn the CSV file into a dataframe using pandas, and I use the CTTDs, TC and TG columns to find the candidates.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np
import csv

filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0
for f in filenames:
    file_html = open(str(f), "r")
    df = pd.read_csv(file_html)
    #df = pd.read_csv('smoothing/Smoothing001.csv')
    news = np.array(df['CTTDs'])
    new = np.array(df['TG'])
    minval = np.min(news[np.nonzero(news)])
    maxval = np.max(news[np.nonzero(news)])
    j = 0.2
    thetaCTTD = minval + j * (maxval - minval)
    #maxGap = np.max(new[np.nonzero(new)])
    #minGap = np.min(new[np.nonzero(new)])
    thetaGap = np.min(new[np.nonzero(new)])
    #print thetaCTTD
    #print maxval
    #print minval
    #print thetaGap

    def create_candidates(df, thetaCTTD, thetaGAP):
        k = 0
        TB = {}
        TC = 0
        for index in range(0, len(df) - 1):
            start = index
            if df.ix[index]['CTTDs'] > thetaCTTD:
                start = index
                gap = 0
                TC = df.ix[index]['TC']
                for index in range(index + 1, len(df) - 1):
                    if df.ix[index]['TG'] == 0:
                        continue
                    elif df.ix[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
                        break
                    elif df.ix[index]['CTTDs'] <= thetaCTTD:
                        gap += 1
                    TC += df.ix[index]['TC']
                if (TC < 1) or (start == index):
                    continue
                TB.update({
                    k: {
                        'start': start,
                        'end': index - 1
                    }
                })
                k += 1
        return TB

    def get_unique_candidate(TB):
        TB = tb.copy()
        for key, value in tb.iteritems():
            if key == len(tb) - 1:
                break
            if value['end'] == tb[key + 1]['end']:
                del TB[key + 1]
            elif value['start'] < tb[key + 1]['start'] < value['end']:
                TB[key]['end'] = tb[key + 1]['start'] - 1
            else:
                continue
        return TB

    index += 1
    stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
    tb = create_candidates(df, thetaCTTD, thetaGap)
    TB = get_unique_candidate(tb)
    filewrite = open(stored_file, "wb")
    df_list = []
    for (k, d) in TB.iteritems():
        candidate_df = df.loc[d['start']:d['end']]
        candidate_df['candidate'] = k
        df_list.append(candidate_df)
    output_df = pd.concat(df_list)
    output_df.to_csv(stored_file)
    writer = csv.writer(filewrite, lineterminator='\n')
    filewrite.close()
thetaCTTD is 10.36 and thetaGap is 1.
The output means there are 2 candidate text blocks: the first starts at line number 215 and ends at line number 225, and the other starts at line number 500 and ends at line number 501.
My question is: how do I save the output to CSV so that not only the line numbers but the whole rows of each text block, with all the other columns, appear in the output too?
Assuming your output is a list of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.iteritems()])
Note that we slice by label, so d['end'] will be included.
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
df_list = []
for (k, d) in TB.iteritems():
    candidate_df = df.loc[d['start']:d['end']]
    candidate_df['candidate'] = k
    df_list.append(candidate_df)
output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.
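To actually write the result to disk (the part the title asks about), the standard to_csv call is enough; keeping the index preserves the original line numbers of each candidate row:

# stored_file as defined in the question; index=False would drop the
# original line numbers, so the default (keep the index) is likely wanted.
output_df.index.name = 'line'
output_df.to_csv(stored_file)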

How to split merged cells in excel workbook using python

Is there any way to split/unmerge cells in an Excel workbook using Python? What I want is explained below.
The result should be a new Excel file with the following entries.
My solution using xlrd to copy the same string for all merged columns is given below.
[Note: the "formatting_info=True" flag is not yet implemented in the version of xlrd I am using, hence I cannot directly get the list of merged cells. I am not supposed to upgrade xlrd on the setup.]
import xlrd

def xlsx_to_dict():
    workbook = xlrd.open_workbook(xlsfile)
    worksheet_names = workbook.sheet_names()
    for worksheet_name in worksheet_names:
        worksheet = workbook.sheet_by_name(worksheet_name)
        num_rows = worksheet.nrows - 1
        num_cells = worksheet.ncols - 1
        curr_row = -1
        header_row = worksheet.row(0)
        columns = []
        for cell in range(len(header_row)):
            value = worksheet.cell_value(0, cell)
            columns.append(value)
        cities = []
        for row in range(1, num_rows):
            value = worksheet.cell_value(row, 0)
            type = worksheet.cell_type(row, 0)
            if not value == "":
                cities.append(value)
        names = []
        for row in range(1, num_rows):
            value = worksheet.cell_value(row, 1)
            type = worksheet.cell_type(row, 1)
            if not value == "":
                names.append(value)
        current_city = cities[0]
        result_dict = {}
        for curr_row in range(1, num_rows):
            row = worksheet.row(curr_row)
            curr_cell = -1
            curr_name = names[0]
            while curr_cell < num_cells:
                curr_cell += 1
                cell_value = worksheet.cell_value(curr_row, curr_cell)
                if cell_value in cities and curr_cell == 0:
                    current_city = cell_value
                    if not result_dict.has_key(current_city):
                        result_dict[current_city] = {}
                    continue
                if cell_value == "" and curr_cell == 0:
                    continue
                if cell_value in names and curr_cell == 1:
                    curr_name = cell_value
                    if not result_dict[current_city].has_key(curr_name):
                        result_dict[current_city][curr_name] = {}
                    continue
                if cell_value == "" and curr_cell == 1:
                    continue
                try:
                    result_dict[current_city][curr_name]['Phone'].append(cell_value)
                except:
                    result_dict[current_city][curr_name]['Phone'] = [cell_value]
The above function will return a Python dictionary like this:
{ 'New York' : { 'Tom' : [92929292, 33929] }, ........}
I will then traverse the dictionary and write a new Excel file.
However, I want some generic way of splitting merged cells.
This function gets the "real" cell value, i.e., the value of the merged cell if the coordinates are anywhere inside the merged cell.
def unmergedValue(rowx, colx, thesheet):
    for crange in thesheet.merged_cells:
        rlo, rhi, clo, chi = crange
        if rowx in xrange(rlo, rhi):
            if colx in xrange(clo, chi):
                return thesheet.cell_value(rlo, clo)
    # if you reached this point, it's not in any merged cells
    return thesheet.cell_value(rowx, colx)
Loosely based on http://www.lexicon.net/sjmachin/xlrd.html#xlrd.Sheet.merged_cells-attribute
Very inefficient, but should be acceptable for small-ish spreadsheets.
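A usage sketch, assuming a version of xlrd where formatting_info=True is supported so that merged_cells is populated, and xlsfile as in the question:

import xlrd

book = xlrd.open_workbook(xlsfile, formatting_info=True)
sheet = book.sheet_by_index(0)
# Dump the sheet with every merged region expanded to its top-left value.
for rowx in xrange(sheet.nrows):
    print [unmergedValue(rowx, colx, sheet) for colx in xrange(sheet.ncols)]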
If your file has no empty cells in the middle, this may help: read the file, do some work, rewrite it.
import xlrd

def read_merged_xls(file_contents):
    book = xlrd.open_workbook(file_contents=file_contents)
    data = []
    sheet = book.sheet_by_index(0)
    for rx in range(sheet.nrows):
        line = []
        for ry in range(sheet.ncols):
            cell = sheet.cell_value(rx, ry)
            if not cell:
                cell = data[-1][ry] if data else ''
            line.append(cell)
        data.append(line)
    return data
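A similar forward-fill can be had with pandas alone (a sketch, assuming the blanks come from merged cells, which pandas reads as NaN; the file name is a placeholder):

import pandas as pd

# read_excel keeps only the top-left value of a merged range; ffill then
# copies values downward, column by column.
df = pd.read_excel('input.xls')
df = df.ffill()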
import xlrd
import xlsxwriter
import numpy as np
import pandas as pd

def rep(l, i):
    j = i
    while j >= 0:
        if not l[j - 1] == u'':
            return l[j - 1]
        else:
            j = j - 1

def write_df2xlsx(df, filename):
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pd.ExcelWriter(filename, engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    # Close the Pandas Excel writer and output the Excel file.
    writer.save()

def csv_from_excel(filename):
    wb = xlrd.open_workbook(filename)
    worksheet_names = wb.sheet_names()
    for worksheet_name in worksheet_names:
        sh = wb.sheet_by_name(worksheet_name)
        # To find the headers/column names of the xlsx file
        header_index = 0
        for i in range(sh.nrows):
            if len(filter(lambda x: not (x.value == xlrd.empty_cell.value), sh.row(i))) == len(sh.row(i)):
                header_row = sh.row(i)
                header_index = i
                break
        columns = []
        for cell in range(len(header_row)):
            value = sh.cell_value(header_index, cell)
            columns.append(value)
        rows = []
        for rownum in range(header_index + 1, sh.nrows):
            rows.append(sh.row_values(rownum))
        data = pd.DataFrame(rows, columns=columns)
        cols = [col for col in data.columns if u'' in list(data[col])]
        res = []
        for col in cols:
            t_list = list(data[col])
            res.append(map(lambda x, y: rep(list(data[col]), y[0]) if x == u'' else x, t_list, enumerate(t_list)))
        for (col, r) in zip(cols, res):
            data[col] = pd.core.series.Series(r)
        write_df2xlsx(data, 'ResultFile.xlsx')
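For completeness, on .xlsx files the same unmerge-and-fill idea can be written directly with openpyxl, which exposes the merged ranges; a sketch (file names are placeholders):

from openpyxl import load_workbook

wb = load_workbook('input.xlsx')
ws = wb.active
# Iterate over a copy, since unmerging mutates the set of ranges.
for merged in list(ws.merged_cells.ranges):
    top_left = ws.cell(merged.min_row, merged.min_col).value
    ws.unmerge_cells(str(merged))
    # Copy the top-left value into every cell the range used to cover.
    for row in ws.iter_rows(min_row=merged.min_row, max_row=merged.max_row,
                            min_col=merged.min_col, max_col=merged.max_col):
        for cell in row:
            cell.value = top_left
wb.save('output_unmerged.xlsx')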
