python script works on windows but not in linux - python

I am working on a project where I want to import a large amount of data into ArangoDB. All the data are in .xlsx files with multiple worksheets, so I wrote a script that converts the .xlsx files to JSON files (one JSON file per worksheet), then establishes a connection with ArangoDB and bulk-imports the data. I wrote the script in a Jupyter notebook on a Windows PC with the latest Anaconda version, and it works like a charm with both local and remote database connections. After I saw that the code works, I copied the script to my CentOS 7 virtual server and ran it, and it crashed. I also ran it on a physical machine with Ubuntu 19.10 and it crashed there too. Both Linux machines were up to date and also ran the latest Anaconda version. The script was run both from the command line as a .py file and as an .ipynb file from a Jupyter notebook on all the machines (Windows and Linux). On Windows it works perfectly; on Linux it crashes when it starts converting the first .xlsx file. The code for the script is this:
from zipfile import ZipFile
from bs4 import BeautifulSoup
import pandas as pd
from xlsx2csv import Xlsx2csv as x2csv
import os
import hashlib
import json
import numpy as np
from arango import ArangoClient
import glob
filelist = []
hash_dict = {}
current_folder = os.getcwd()
for file in os.listdir(current_folder):
    if file.endswith(".xlsx"):
        filelist.append(file)

#create a list of all worksheets contained in the worksheet
def create_sheet_list(file):
    with ZipFile(file) as zipped_file:
        summary = zipped_file.open(r'xl/workbook.xml').read()
    soup = BeautifulSoup(summary, "xml")
    sheets = [sheet.get("name") for sheet in soup.find_all("sheet")]
    return sheets

#create an array of dataframes from all the worksheets
def create_dataframes(file):
    xl = pd.ExcelFile(file)
    xl.sheet_names
    dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
    return dfs

def create_json(file,sheets,dfs):
    print(("The file contains {} sheets").format(len(sheets)))
    count = 0
    for i in sheets:
        json_filelist = []
        count = count + 1
        #produce the dataframe and check if there are any encoding errors
        try:
            df = dfs[i]
            new_header = df.iloc[0]
            df = df[1:]
            df.columns = new_header
            df = df.fillna(0)
            hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
            hash_str=int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
            values = str(hash_str)
            df['Hash']=np.nan
            df['Hash']= df['Hash'].fillna(value=values)
            #hash_dict.update({hash_str_name : values})
            hash_dict[hash_str_name] = values
            json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient = "records")
        #For the dataframes that will get an error because of encoding a different way of conversion will be used
        except UnicodeEncodeError:
            x2csv(file, outputencoding="utf-8").convert(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')),count)
            df = pd.read_csv(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')), header = 1)
            hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
            hash_str=int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
            values = str(hash_str)
            df['Hash']=np.nan
            df['Hash']= df['Hash'].fillna(value=values)
            #hash_dict.update({hash_str_name : values})
            hash_dict[hash_str_name] = values
            json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient = "records")
            os.remove(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')))

#Create connection with the Database
def create_db_connection():
    client = ArangoClient(hosts='http://127.0.0.1:8529')
    db = client.db('CEM', username='root', password='123456')
    return db

#Get the list of the .json files from all the folders
def list_of_json():
    path = os.getcwd()
    folders = os.listdir(path)
    json_names = []
    for folder in folders:
        files = glob.glob(path+"/"+folder+"/"+"*.json")
        if len(files)>0:
            json_names.append(files)
    return json_names

#Get the list of the collections in the database
def list_of_collections(sheets,db):
    for col in sheets:
        col = col.replace(' ','_')
        if db.has_collection(col):
            collect = db.collection(col)
        else:
            collect = db.create_collection(col)
    collection = db.collections()
    collection = [i['name'] for i in collection if i['name'][0].isupper()]
    return collection

#Import the data from the .json files to the appropriate collections
def import_data(json_names,collection, db):
    for x in json_names:
        for y in x:
            for z in collection:
                with open(y, "r") as json_file:
                    if y.endswith("{}.json".format(z)):
                        data = json.load(json_file)
                        z = db.collection(z)
                        z.import_bulk(data)

for file in filelist:
    try:
        #create the folder where the .json files from that UFED will be stored
        new_folder = os.mkdir(os.getcwd()+"/"+file.strip('.xlsx'))
        #get the path for the new folder
        new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
    except FileExistsError:
        #if the folder already exists just get its path
        new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
    print(new_path)
    #print the name of the file that's being analyzed so that we have a measure of progress
    print(("Now I am working with {} file").format(file))
    #call the functions and run the program
    create_sheet_list(file)
    create_dataframes(file)
    sheets = create_sheet_list(file)
    dfs = create_dataframes(file)
    create_json(file,sheets,dfs)
    df_dict = pd.DataFrame(list(hash_dict.items()), index = None, columns = ["File_name", "Hash_num"])
    df_dict.to_json(current_folder+"/hash_list.json", orient = "records")
    create_db_connection()
    db = create_db_connection()
    #create_collections(sheets,db)
    list_of_json()
    json_names = list_of_json()
    list_of_collections(sheets,db)
    collection = list_of_collections(sheets,db)
    import_data(json_names,collection,db)
Can anyone help?
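Not an answer to the crash itself, but one platform-independent detail worth checking while debugging: file.strip('.xlsx') does not remove the ".xlsx" suffix; it strips any run of the characters '.', 'x', 'l' and 's' from both ends of the name, so file names that start or end with those letters produce mangled folder and JSON names. A minimal sketch of a safer way to get the stem, using os.path.splitext (the helper name is only an illustration, not part of the original script):

import os

def stem(filename):
    # "Report.xlsx" -> "Report"; only the real extension is removed
    base, _ext = os.path.splitext(filename)
    return base

# e.g. new_path = "{0}/{1}".format(os.getcwd(), stem(file))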

Related

Run Python script on all files in the directory after a file has run, create a new folder and name the folder after the file

I wrote a Python script TestData.py that uses Pandas and NumPy to test a CSV for data anomalies. It inputs one CSV and outputs 4 new ones. For each input file that needs testing I do the following:
Copy the name of the unknown file. In this example: unknownfilename1.csv
Create a folder.
Rename the New Folder by pasting in unknownfilename1.csv, removing the .csv
Paste unknownfilename1.csv into data = pd.read_csv("unknownfilename0.csv")
Drag TestData.py into the folder unknownfilename1
Finally, run TestData.py
import pandas as pd
import numpy as np
# Import raw data
data = pd.read_csv("unknownfilename1.csv", encoding='latin-1' )
#################################################
# Over 500 lines of code using Pandas and Numpy #
#################################################
# failed at least one testcase, needs to be fixed before importing.
failed.to_csv("C:/users/path/Failed.csv", index = False)
# Output passed rows.
passed.to_csv("C:/users/path/Passed.csv", index = False)
# Ready to import.
newimpomatic.to_csv("C:/users/path/Import.csv", index = False)
# Duplicates IDs
duplicated.to_csv("C:/users/path/duplicated.csv", index = False)
I would like each file to be tested in:
C:/users/path/unknownfilename1.csv
C:/users/path/unknownfilename2.csv
C:/users/path/unknownfilename3.csv
To output:
C:/users/path/unknownfilename1/Failed.csv
C:/users/path/unknownfilename1/Passed.csv
C:/users/path/unknownfilename1/Import.csv
C:/users/path/unknownfilename1/duplicated.csv
C:/users/path/unknownfilename2/Failed.csv
C:/users/path/unknownfilename2/Passed.csv
C:/users/path/unknownfilename2/Import.csv
C:/users/path/unknownfilename2/duplicated.csv
C:/users/path/unknownfilename3/Failed.csv
C:/users/path/unknownfilename3/Passed.csv
C:/users/path/unknownfilename3/Import.csv
C:/users/path/unknownfilename3/duplicated.csv
If I have hundreds of different files in a folder, what is the easiest way to add something to my script so that it tests all of the files and, after each file is tested, creates a new folder named after the file that was tested?
The Path class in the Python built-in library pathlib is great at this, and at working with file/folder locations in general. With glob(pattern: str), you can yield all matches to a particular file pattern in a directory and iterate over those matches.
https://docs.python.org/3.9/library/pathlib.html#pathlib.Path.glob
You can also use Path to grab the name of the file and create a new directory to place your outputted csvs.
The file below assumes it is in the same directory as all of the original csvs, but that is changeable. I call that directory base_dir, equivalent to what you listed as C:/users/path/
/users/path/main.py:
from pathlib import Path

import pandas as pd
import numpy as np

failed_csv = 'Failed.csv'
passed_csv = 'Passed.csv'
import_csv = 'Import.csv'
dup_csv = 'duplicated.csv'

def get_root() -> Path:
    return Path(__file__).resolve().parent

def process(csv_file: Path, out_dir: Path) -> None:
    data = pd.read_csv(csv_file, encoding='latin-1')
    ###
    ### Do existing processing of data DataFrame
    ###

    # Save files. These print statements will show the final
    # file path for each of the output csvs.
    print(out_dir / failed_csv)  # '/users/path/my_file/Failed.csv'
    print(out_dir / passed_csv)  # '/users/path/my_file/Passed.csv'
    print(out_dir / import_csv)  # '/users/path/my_file/Import.csv'
    print(out_dir / dup_csv)     # '/users/path/my_file/duplicated.csv'

    failed.to_csv(out_dir / failed_csv, index=False)
    passed.to_csv(out_dir / passed_csv, index=False)
    newimpomatic.to_csv(out_dir / import_csv, index=False)
    duplicated.to_csv(out_dir / dup_csv, index=False)

def main(base_dir: Path) -> None:
    print(f'Processing files in {base_dir}: \n')
    n_process = 0
    for csv_file in base_dir.glob('*.csv'):
        # ex. csv_file = "/users/path/my_file.csv"
        name: str = csv_file.stem           # name = "my_file"
        output_dir: Path = base_dir / name  # output_dir = "/users/path/my_file"

        print(f'Creating directory "{output_dir}"')
        Path.mkdir(output_dir, exist_ok=True)

        print(f'Processing "{csv_file}"')
        process(csv_file=csv_file, out_dir=output_dir)
        print(f'Completed processing\n')
        n_process += 1

    print(f'\nProcessed {n_process} files')

if __name__ == '__main__':
    root = get_root()  # root = "users/path"
    main(base_dir=root)
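If the script does not live next to the csvs, base_dir can be pointed anywhere; a minimal usage sketch (the path shown is only a placeholder):

main(base_dir=Path(r'C:/users/path'))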

How do I automate appends from new files created, or changes in files to a csv in Python?

I have a directory where I have xls files which are being updated, and new xls files are being added.
I have combined all of the data using:
import os
import pandas as pd

def read_sheets(filename):
    result = []
    sheets = pd.read_excel(filename, sheet_name=None)
    for name, sheet in sheets.items():
        sheet['Sheetname'] = name
        sheet['Row'] = sheet.index
        result.append(sheet)
    return pd.concat(result, ignore_index=True)

def read_files(filenames):
    result = []
    for filename in filenames:
        file = read_sheets(filename)
        file['Filename'] = filename
        result.append(file)
    return pd.concat(result, ignore_index=True)

files = ['1.xls', '2.xls','3.xls','4.xls','5.xls']
dfoo = read_files(files)
But I want to know: if any changes are made to these xls files, how can I automate appending them to dfoo, and if new files such as 6.xls or 7.xls are created later (which will have the same column headers), how can that data also be appended to dfoo?
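One possible sketch, not from the original thread: keep using read_files() from above, but discover the files with glob instead of hard-coding the list, and rebuild dfoo whenever the folder's modification times change. This assumes all the xls files live in one folder and that re-reading them is cheap enough:

import glob
import os
import time

def build_dfoo(folder='.'):
    # pick up every .xls file currently in the folder, including new ones like 6.xls
    filenames = sorted(glob.glob(os.path.join(folder, '*.xls')))
    return read_files(filenames)

def folder_state(folder='.'):
    # snapshot of (filename, modification time) used to detect edits or new files
    return {f: os.path.getmtime(f) for f in glob.glob(os.path.join(folder, '*.xls'))}

last_state = folder_state()
dfoo = build_dfoo()
while True:
    time.sleep(60)  # poll once a minute
    state = folder_state()
    if state != last_state:
        dfoo = build_dfoo()  # rebuild from all files so edits and additions are included
        last_state = state

For event-driven updates instead of polling, a file-watching library such as watchdog could be used, but a polling loop like the one above is usually enough for a handful of files.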

Importing Multiple HTML Files Into Excel as Separate Worksheets

I have a number of HTML files that I need to open up or import into a single Excel Workbook and simply save the Workbook. Each HTML file should be on its own Worksheet inside the Workbook.
My existing code does not work; it crashes on the workbook.Open(html) line and would probably fail on the following lines as well. I can't find anything specific to this topic by searching the web.
import win32com.client as win32
import pathlib as path

def save_html_files_to_worksheets(read_directory):
    read_path = path.Path(read_directory)
    save_path = read_path.joinpath('Single_Workbook_Containing_HTML_Files.xlsx')
    excel_app = win32.gencache.EnsureDispatch('Excel.Application')
    workbook = excel_app.Workbooks.Add()  # create a new excel workbook
    indx = 1  # used to add new worksheets dependent on number of html files
    for html in read_path.glob('*.html'):  # loop through directory getting html files
        workbook.Open(html)  # open the html in the newly created workbook - this doesn't work though
        worksheet = workbook.Worksheets(indx)  # each iteration in loop add new worksheet
        worksheet.Name = 'Test' + str(indx)  # name added worksheets
        indx += 1
    workbook.SaveAs(str(save_path), 51)  # win32com requires string like path, 51 is xlsx extension
    excel_app.Application.Quit()

save_html_files_to_worksheets(r'C:\Users\<UserName>\Desktop\HTML_FOLDER')
The following code does half of what I want, if this helps. It converts each HTML file into a separate Excel file. I need all the HTML files in a single Excel file, each on its own worksheet.
import win32com.client as win32
import pathlib as path

def save_as_xlsx(read_directory):
    read_path = path.Path(read_directory)
    excel_app = win32.gencache.EnsureDispatch('Excel.Application')
    for html in read_path.glob('*.html'):
        save_path = read_path.joinpath(html.stem + '.xlsx')
        wb = excel_app.Workbooks.Open(html)
        wb.SaveAs(str(save_path), 51)
    excel_app.Application.Quit()

save_as_xlsx(r'C:\Users\<UserName>\Desktop\HTML_FOLDER')
Here is a link to a sample HTML file you can use, the data in the file is not real: HTML Download Link
One solution would be to open the HTML file into a temporary workbook, and copy the sheet from there into the workbook containing all of them:
workbook = excel_app.Application.Workbooks.Add()
sheet = workbook.Sheets(1)
for path in read_path.glob('*.html'):
    workbook_tmp = excel_app.Application.Workbooks.Open(path)
    workbook_tmp.Sheets(1).Copy(Before=sheet)
    workbook_tmp.Close()

# Remove the redundant 'Sheet1'
excel_app.Application.ShowAlerts = False
sheet.Delete()
excel_app.Application.ShowAlerts = True
I believe pandas will make your job much easier.
pip install pandas
Here's an example of how to get multiple tables from a Wikipedia HTML page, load each into a Pandas DataFrame, and save them to disk.
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_American_films_of_2017"
wikitables = pd.read_html(url, header=0, attrs={"class":"wikitable"})
for idx, df in enumerate(wikitables):
    df.to_csv('{}.csv'.format(idx), index=False)
For your use case, something like this should work:
import pathlib as path
import pandas as pd

def save_as_xlsx(read_directory):
    read_path = path.Path(read_directory)
    for html in read_path.glob('*.html'):
        save_path = read_path.joinpath(html.stem + '.xlsx')
        dfs_from_html = pd.read_html(html, header=0,)
        for idx, df in enumerate(dfs_from_html):
            df.to_excel('{}.xlsx'.format(idx), index=False)
Make sure to set the correct HTML attributes (the attrs argument) in the pd.read_html function.
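For example, if the tables of interest share a CSS class or an id, it can be passed the same way as in the Wikipedia example above (the class name here is only a placeholder):

dfs_from_html = pd.read_html(html, header=0, attrs={"class": "my-table-class"})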
How about this?
Sub From_XML_To_XL()
'UpdatebyKutoolsforExcel20151214
    Dim xWb As Workbook
    Dim xSWb As Workbook
    Dim xStrPath As String
    Dim xFileDialog As FileDialog
    Dim xFile As String
    Dim xCount As Long
    On Error GoTo ErrHandler
    Set xFileDialog = Application.FileDialog(msoFileDialogFolderPicker)
    xFileDialog.AllowMultiSelect = False
    xFileDialog.Title = "Select a folder [Kutools for Excel]"
    If xFileDialog.Show = -1 Then
        xStrPath = xFileDialog.SelectedItems(1)
    End If
    If xStrPath = "" Then Exit Sub
    Application.ScreenUpdating = False
    Set xSWb = ThisWorkbook
    xCount = 1
    xFile = Dir(xStrPath & "\*.xml")
    Do While xFile <> ""
        Set xWb = Workbooks.OpenXML(xStrPath & "\" & xFile)
        xWb.Sheets(1).UsedRange.Copy xSWb.Sheets(1).Cells(xCount, 1)
        xWb.Close False
        xCount = xSWb.Sheets(1).UsedRange.Rows.Count + 2
        xFile = Dir()
    Loop
    Application.ScreenUpdating = True
    xSWb.Save
    Exit Sub
ErrHandler:
    MsgBox "no files xml", , "Kutools for Excel"
End Sub

Adding Multiple .xls files to a Single .xls file, using the file name to name tabs

I have multiple directories, each of which contains any number of .xls files.
I'd like to take the files in any given directory and combine them into one .xls file, using the file names as the tab names.
For example if there are the files NAME.xls, AGE.xls, LOCATION.xls, I'd like to combine them into a new file with the data from NAME.xls on a tab called NAME, the data from AGE.xls on a tab called AGE and so on.
Each source .xls file only has one column of data with no headers.
This is what I have so far, and well it's not working.
Any help would be greatly appreciated (I'm fairly new to Python and I've never had to do anything like this before).
wkbk = xlwt.Workbook()
xlsfiles = glob.glob(os.path.join(path, "*.xls"))
onlyfiles = [f for f in listdir(path) if isfile(join(path, f))]
tabNames = []
for OF in onlyfiles:
    if str(OF)[-4:] == ".xls":
        sheetName = str(OF)[:-4]
        tabNames.append(sheetName)
    else:
        pass

for TN in tabNames:
    outsheet = wkbk.add_sheet(str(TN))
    data = pd.read_excel(path + "\\" + TN + ".xls", sheet_name="data")
    data.to_excel(path + "\\" + "Combined" + ".xls", sheet_name = str(TN))
Here is a small helper function - it supports both .xls and .xlsx files:
import pandas as pd
try:
    from pathlib import Path
except ImportError:  # Python 2
    from pathlib2 import Path

def merge_excel_files(dir_name, out_filename='result.xlsx', **kwargs):
    p = Path(dir_name)
    with pd.ExcelWriter(out_filename) as xls:
        _ = [pd.read_excel(f, header=None, **kwargs)
             .to_excel(xls, sheet_name=f.stem, index=False, header=None)
             for f in p.glob('*.xls*')]
Usage:
merge_excel_files(r'D:\temp\xls_directory', 'd:/temp/out.xls')
merge_excel_files(r'D:\temp\xlsx_directory', 'd:/temp/out.xlsx')
Can you try
import pandas as pd
import glob

path = 'YourPath\ToYour\Files\\'  # Note the \\ at the end

# Create a list with only .xls files
list_xls = glob.glob1(path, "*.xls")

# Create a writer for pandas
writer = pd.ExcelWriter(path + "Combined.xls", engine = 'xlwt')

# Loop on all the files
for xls_file in list_xls:
    # Read the xls file and the sheet named data
    df_data = pd.read_excel(io = path + xls_file, sheet_name="data")
    # Are the sheet containing data in all your xls file named "data" ?
    # Write the data into a sheet named after the file
    df_data.to_excel(writer, sheet_name = xls_file[:-4])

# Save and close your Combined.xls
writer.save()
writer.close()
Let me know if it works for you; I have never tried engine='xlwt', since I work with .xlsx files rather than .xls.
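Worth noting: xlwt can only write legacy .xls files and is no longer maintained (newer pandas versions have dropped that engine), so if .xlsx output is acceptable the same pattern should work with openpyxl instead, for example:

writer = pd.ExcelWriter(path + "Combined.xlsx", engine='openpyxl')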

Improving python code for updating google spreadsheets

First of all, I am new to Python (practically, I have learned only from SoloLearn, and only half the course at that), so I would ask you to give me a reasonably detailed answer.
My task has the following broad steps:
Delete the old .xlsx file (if any)
Convert two .xls files into .xlsx files using win32, delete the first row, and then delete the .xls files [the .xls files arrive already downloaded into the source directory in a strange format: xlrd and pyexcel raise an "unsupported format or corrupt file" error when opening them, and online analysis of the files suggests they are actually html/htm]
Get the data from the xlsx file
First, delete the old worksheet on the Google spreadsheet to remove old data. Create a new worksheet with the same name. Insert the data into the new worksheet on the Google spreadsheet.
Open the second sheet (which imports data from the first sheet) and update one cell in the Dummy sheet to make sure the Google spreadsheet is synchronised in the background.
Now, I wrote the code below by combining many snippets and a lot of googling.
The code works fine, but it takes on average about 65 seconds to complete the whole process.
My question has 3 parts:
Is there any way to directly access the data from the .xls file? (a sketch of one possibility follows the code below)
Is there any way this code's performance can be improved?
Is there any other, more efficient method for completing the above task?
My code:
import time
import win32com.client as win32
import os
import openpyxl
from openpyxl.utils import get_column_letter
import gspread
from oauth2client.service_account import ServiceAccountCredentials

start = time.time()

# set input-output file locations
source_dir = "C:\\Users\\XYZ\\Downloads"
output_dir = "C:\\Users\\XYZ\\Excels"

# use creds to create a client to interact with the Google Drive API
# make sure to share files with email contained in json file
scope = ['https://spreadsheets.google.com/feeds']
# code will not work without json file
creds = ServiceAccountCredentials.from_json_keyfile_name("C:\\Users\\XYZ\\your.json", scope)
gc = gspread.authorize(creds)
# following code is to open any spreadsheet by name
sh = gc.open("First Sheet")

def save_as_xlsx(input_file, output_dir, output_file_name):
    # call excel using win32, then open .xls file
    # delete first row and then save as .xlsx
    excel = win32.gencache.EnsureDispatch('Excel.Application')
    wb = excel.Workbooks.Open(input_file)
    wbk = excel.ActiveWorkbook
    sheet = wbk.Sheets(1)
    sheet.Rows(1).Delete()
    wb.SaveAs(output_dir + '\\' + output_file_name, FileFormat = 51)
    # FileFormat = 51 is for .xlsx extension. FileFormat = 56 is for .xls extension
    wb.Close()
    excel.Application.Quit()
    return True

def get_the_data_from_xlsx(output_dir, output_file_name):
    # use openpyxl.load to find out last cell of file
    # store cell values in list called data
    wb = openpyxl.load_workbook(output_dir + '\\' + output_file_name)
    sheet = wb.active
    max_row_no = sheet.max_row
    max_column_no = sheet.max_column
    max_column = get_column_letter(max_column_no)
    last_cell = str(max_column) + str(max_row_no)
    cell_addresses = sheet['A1' : last_cell]
    data = []
    for i in cell_addresses:
        for e in i:
            data.append(e.value)
    return (data, last_cell)

def insert_data_into_spreadsheet(name_of_worksheet, data, last_cell):
    # Find a workbook by name in already opened spreadsheet
    # delete the worksheet to clear old data
    # create worksheet with same name to maintain import connections in sheets.
    worksheet = sh.worksheet(name_of_worksheet)
    sh.del_worksheet(worksheet)
    worksheet = sh.add_worksheet(title=name_of_worksheet, rows="500", cols="30")
    # store range of cells for spreadsheet in list named cell_list
    cell_list = worksheet.range('A1' + ':' + str(last_cell))
    # attach all the values from data list as per the cell_list
    a = 0
    for cell in cell_list:
        cell.value = data[a]
        a = a + 1
    # update all cells stored in cell_list in one go
    worksheet.update_cells(cell_list)

def delete_file(directory, file_initials):
    for filename in os.listdir(directory):
        if filename.startswith(file_initials):
            os.unlink(directory + "\\" + filename)

# check if files are in source_dir
for filename in os.listdir(source_dir):
    # check for file1.xls and set input_file name if any file exists.
    if filename.startswith("file1"):
        input_file = source_dir + "\\file1.xls"
        output_file1 = "output_file1.xlsx"
        # detect and delete any old file in output directory
        delete_file(output_dir, "output_file1")
        if save_as_xlsx(input_file, output_dir, output_file1) == True:
            # delete the file from source directory after work is done
            delete_file(source_dir, 'file1')
            # get data from new xlsx file
            data_from_xlsx = get_the_data_from_xlsx(output_dir, output_file1)
            data_to_spreadsheet = data_from_xlsx[0]
            last_cell = data_from_xlsx[1]
            # insert updated data into spreadsheet
            insert_data_into_spreadsheet("file1_data", data_to_spreadsheet, last_cell)
    # repeat the same process for 2nd file
    if filename.startswith('file2'):
        input_file = source_dir + "\\file2.xls"
        output_file2 = "output_file2.xlsx"
        delete_file(output_dir, "output_file2")
        if save_as_xlsx(input_file, output_dir, output_file2) == True:
            delete_file(source_dir, 'file2')
            data_from_xlsx = get_the_data_from_xlsx(output_dir, output_file2)
            data_to_spreadsheet = data_from_xlsx[0]
            last_cell = data_from_xlsx[1]
            insert_data_into_spreadsheet("file2_data", data_to_spreadsheet, last_cell)

# open spreadsheet by name and open Dummy worksheet
# update one cell to sync the sheet with other sheets
sh = gc.open("second sheet")
worksheet = sh.worksheet("Dummy")
worksheet.update_acell('B1', '=Today()')

end = time.time()
print(end - start)
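Regarding part 1: since the downloaded .xls files appear to really be HTML (per the note in step 2), one avenue worth trying, sketched here on that assumption, is to let pandas parse them directly and skip the Excel/win32 round trip:

import pandas as pd

# assumes the ".xls" file is actually an HTML table, as the online analysis suggested
tables = pd.read_html(source_dir + "\\file1.xls")
df = tables[0]                         # first table in the file
df = df.iloc[1:]                       # drop the first row, as the win32 step does now
data = df.values.flatten().tolist()    # flat list of cell values, similar to get_the_data_from_xlsx()

If that works, it would remove the Excel automation step entirely, which may also help with the overall runtime.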
