I have a column called Soru-TR that contains two Turkish entries. I want to translate the data in the Soru-TR column and save the translations in a Soru-EN column.
This is the error I get in the output:
Traceback (most recent call last):
File "C:\Users\User1\Desktop\test.py", line 17, in <module>
df.insert(1, "Soru-EN", output2)
File "C:\Users\User1\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\frame.py", line 4443, in insert
raise ValueError(f"cannot insert {column}, already exists")
ValueError: cannot insert Soru-EN, already exists
before running the code
after running the code
import os
from typing import List
from openpyxl import load_workbook
import pandas as pd
from google.cloud import translate_v2
# Point the Google client library at the service-account key file.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r"C:\Users\User1\Desktop\translate-598740482087.json"
translate_client = translate_v2.Client()
target = "en"

df = pd.read_excel('file.xlsx')

for i in df.index:
    # Translate the text found in this row of the Soru-TR column.
    turkish_text = df['Soru-TR'][i]
    response = translate_client.translate(turkish_text, target_language=target)
    english_text = response['translatedText']
    # NOTE(review): assumed to sit inside the loop (the original indentation
    # was lost). This is the call the traceback points at: it tries to insert
    # a column named "Soru-EN" each time it runs.
    df.insert(1, "Soru-EN", english_text)

# Write the frame back onto the existing 'Sayfa1' sheet. The script does this
# twice in a row with identical arguments.
with pd.ExcelWriter('file.xlsx', mode='a', engine="openpyxl", if_sheet_exists='overlay') as writer:
    df.to_excel(writer, sheet_name='Sayfa1', index=False)
with pd.ExcelWriter('file.xlsx', mode='a', engine="openpyxl", if_sheet_exists='overlay') as writer:
    df.to_excel(writer, sheet_name='Sayfa1', index=False)
Try this:
df = pd.read_excel('file.xlsx')

# Create the "Soru-EN" column once, up front, holding "Pending" in every row.
df["Soru-EN"] = "Pending"

for row_label in df.index:
    # Tip: .loc[] is the better way to read or write a single cell.
    source_text = df.loc[row_label, 'Soru-TR']
    result = translate_client.translate(source_text, target_language=target)
    # Put the translated text into the matching row of the new column.
    df.loc[row_label, "Soru-EN"] = result['translatedText']
Hope this helps.
Related
How do I make my df.to_excel call write to an output path? After my script runs, I do not see the files in the output_path directory I have defined.
import pandas as pd
from openpyxl import load_workbook
import os
import datetime
output_path = 'C:/Users/g/Desktop/autotranscribe/python/Processed'
path = 'C:/Users/g/Desktop/autotranscribe/python/Matching'
cols_to_drop = ['PSI ID','PSIvet Region','PSIvet region num','Fax','County']
column_name_update_map = {'Account name': 'Company Name','Billing address':'Address','Billing city':'City','Billing State':'State'}

for file in os.listdir("C:/Users/g/Desktop/autotranscribe/python/Matching"):
    # Only handle "PSI..." workbooks that have not been processed already.
    if not file.startswith("PSI") or "(updated headers)" in file:
        continue

    # `file` is the bare name returned by os.listdir(); it is passed to
    # read_excel() and ExcelWriter() without any directory in front of it.
    dfs = pd.read_excel(file, sheet_name=None, skiprows=5)

    output = {}
    for ws, df in dfs.items():
        if ws.startswith("Cancelled Members"):
            df = df.drop(columns='Active date')
        # Sheets with these prefixes are left out of the result.
        if ws.startswith(("New Members", "PVCC")):
            continue
        # Creation time of the input file; computed but not used afterwards.
        dt = pd.to_datetime(os.path.getctime(os.path.join(path, file)), unit="s").replace(nanosecond=0)
        output[ws] = df

    writer = pd.ExcelWriter(f'{file.replace(".xlsx","")} (updated headers).xlsx')
    for ws, df in output.items():
        df.to_excel(writer, index=None, sheet_name=ws)
    writer.save()
    writer.close()
I tried df.to_excel(writer, output_path, index=None, sheet_name=ws),
but I get an error:
File "", line 36, in
df.to_excel(writer,output_path, index=None, sheet_name=ws)
TypeError: to_excel() got multiple values for argument 'sheet_name'.
A few comments:
The function os.listdir() only returns "unqualified" file names, so before using file, we need to prepend path using something like input_file_name = f'{path}/{file}'.
Similarly, pd.ExcelWriter() will need a qualified file name (that is, including the path as well as the "unqualified" file name), which we can get by doing this: output_file_name = f'{output_path}/{file.replace(".xlsx","")} (updated headers).xlsx'.
There are some elements of the code in your question that may not be getting used, but rather than comment on or change those, I provide a working version with minimal changes below.
I created directories named Matching and Processed. I placed a file named PSI 123.xlsx in Matching with a tab named Cancelled Members containing the following:
will skip
will skip
will skip
will skip
will skip
Col1 Col2 Col3 Active date
xx NY 110 789
I then ran the following modification to your code (note the changes to output_path and path for testing purposes in my environment):
import pandas as pd
from openpyxl import load_workbook
import os
import datetime
#output_path = 'C:/Users/g/Desktop/autotranscribe/python/Processed'
#path = 'C:/Users/g/Desktop/autotranscribe/python/Matching'
output_path = './Processed'
path = './Matching'
cols_to_drop = ['PSI ID','PSIvet Region','PSIvet region num','Fax','County']
column_name_update_map = {'Account name': 'Company Name','Billing address':'Address','Billing city':'City','Billing State':'State'}

# The writer does not create missing directories, so make sure the
# destination exists before any workbook is written.
os.makedirs(output_path, exist_ok=True)

for file in os.listdir(path):
    # Only handle "PSI..." workbooks that have not been processed already.
    if not file.startswith("PSI") or "(updated headers)" in file:
        continue

    # os.listdir() returns bare names, so qualify them with the input directory.
    input_file_name = os.path.join(path, file)
    dfs = pd.read_excel(input_file_name, sheet_name=None, skiprows=5)

    output = {}
    for ws, df in dfs.items():
        # Sheets with these prefixes are left out of the result.
        if ws.startswith(("New Members", "PVCC")):
            continue
        if ws.startswith("Cancelled Members") and 'Active date' in df.columns:
            df = df.drop(columns='Active date')
        output[ws] = df

    # A workbook needs at least one sheet; skip files where every sheet was
    # filtered out instead of failing when the writer is closed.
    if not output:
        continue

    output_file_name = os.path.join(
        output_path, f'{os.path.splitext(file)[0]} (updated headers).xlsx'
    )
    # The context manager saves and closes the workbook exactly once.
    with pd.ExcelWriter(output_file_name) as writer:
        for ws, df in output.items():
            df.to_excel(writer, index=False, sheet_name=ws)
After running, the code had created a new file in Processed named PSI 123 (updated headers).xlsx with sheets named as in the input. The sheet Cancelled Members contained the following:
Address State Zip Status Status.1 Date Partner
Col1 Col2 Col3
xx NY 110
I am trying to save a pandas dataframe as an Excel table to a SharePoint site. I have two separate blocks of code which achieve the tasks below (thanks to the Stack Overflow community).
A script which can save a pandas df as excel table using ExcelWriter on local storage.
A Script which can save a local file to Sharepoint online.
I am confused about how to combine these two so that a df is saved to SharePoint Online, where the Excel file must contain a table, not just a range of data. Kindly help.
SCRIPT 1 to save excel range as table on local
##############################################################################
#
# An example of adding a dataframe to an worksheet table in an xlsx file
# using Pandas and XlsxWriter.
#
# Tables in Excel are used to group rows and columns of data into a single
# structure that can be referenced in a formula or formatted collectively.
#
# SPDX-License-Identifier: BSD-2-Clause
# Copyright 2013-2021, John McNamara, jmcnamara#cpan.org
#
import pandas as pd
# Create a Pandas dataframe from some data.
# Build a small demo dataframe, with the columns already in display order.
df = pd.DataFrame(
    {
        'Country': ['China', 'India', 'United States', 'Indonesia'],
        'Population': [1404338840, 1366938189, 330267887, 269603400],
        'Rank': [1, 2, 3, 4],
    }
)
df = df[['Rank', 'Country', 'Population']]

# Open an Excel writer backed by XlsxWriter.
writer = pd.ExcelWriter('pandas_table.xlsx', engine='xlsxwriter')

# Write only the data cells, starting on the second row: the first row is
# left free for the table's own header, and pandas' header/index are off.
df.to_excel(writer, sheet_name='Sheet1', startrow=1, header=False, index=False)

# Handles to the underlying XlsxWriter objects.
workbook = writer.book
worksheet = writer.sheets['Sheet1']

# Number of data rows and columns in the frame.
max_row, max_col = df.shape
last_col = max_col - 1

# One {'header': name} entry per dataframe column, as add_table() expects.
column_settings = []
for column in df.columns:
    column_settings.append({'header': column})

# Declare the table over the header row plus all data rows.
worksheet.add_table(0, 0, max_row, last_col, {'columns': column_settings})

# Widen every table column to 12 for readability.
worksheet.set_column(0, last_col, 12)

# Write the workbook to disk.
writer.save()
SCRIPT 2 to save any file to sharepoint
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
import os
baseurl = 'https://testsite.sharepoint.com/'
basesite = '/sites/project' # every share point has a home.
siteurl = baseurl + basesite
localpath = "pandas_table.xlsx"
remotepath = "Shared Documents/General/file.xlsx" # existing folder path under sharepoint site.
ctx_auth = AuthenticationContext(siteurl)
ctx_auth.acquire_token_for_user(<username>, <password>)
ctx = ClientContext(siteurl, ctx_auth) # make sure you auth to the siteurl.
with open(localpath, 'rb') as content_file:
file_content = content_file.read()
dir, name = os.path.split(remotepath)
file = ctx.web.get_folder_by_server_relative_url(dir).upload_file(name, file_content).execute_query()
I am not sure how I can use writer.save() with the SharePoint connector. Kindly advise.
thanks in advance.
Given that you have your data in a df the following code will write to sharepoint using the O365 library.
from io import BytesIO
from tempfile import gettempdir
from O365 import Account, FileSystemTokenBackend
import pandas as pd
O365_CLIENT_ID = "client"
O365_SECRET = "secret"
O365_TENANT_ID = "<name>"
O365_SHAREPOINT = "<name>.sharepoint.com"
O365_SITE = "/sites/..."


def save_file(folder_path, filename, data):
    """Upload an in-memory file to the site's default document library.

    Args:
        folder_path: "/"-separated folder path inside the document library.
            An empty path (or one made only of slashes) means the library's
            root folder.
        filename: Name the uploaded file gets in SharePoint.
        data: Binary stream to upload; it must provide ``getbuffer()``
            (for example ``io.BytesIO``) so its size can be reported.

    Raises:
        PermissionError: If authentication with O365 fails.
        FileNotFoundError: If a folder named in ``folder_path`` is missing.
    """
    account = Account(
        (O365_CLIENT_ID, O365_SECRET),
        auth_flow_type="credentials",
        tenant_id=O365_TENANT_ID,
        # Keep the token in a file in the temp directory.
        token_backend=FileSystemTokenBackend(
            token_path=gettempdir(), token_filename="o365_token.txt"
        ),
    )
    if not account.authenticate():
        # Fail loudly: returning silently would look like a successful upload.
        raise PermissionError("O365 authentication failed.")

    drive = (
        account.sharepoint()
        .get_site(O365_SHAREPOINT, O365_SITE)
        .get_default_document_library()
    )

    # str.split() never returns an empty list, so drop empty segments
    # explicitly; this makes "" and "/" select the root folder.
    subfolders = [part for part in folder_path.split("/") if part]

    target_folder = None
    items = drive.get_items()
    for subfolder in subfolders:
        # Match the folder name exactly; a substring test would let "Gen"
        # pick up "General".
        target_folder = next(
            (item for item in items if item.name == subfolder), None
        )
        if target_folder is None:
            raise FileNotFoundError(f"Path {folder_path} does not exist.")
        items = target_folder.get_items()

    if target_folder is None:
        target_folder = drive.get_root_folder()

    target_folder.upload_file(
        item=None,
        item_name=filename,
        stream=data,
        stream_size=data.getbuffer().nbytes,
    )
# Serialize the dataframe to an in-memory workbook and hand it to save_file().
with BytesIO() as xlsx_buffer:
    df.to_excel(xlsx_buffer, index=False)
    # Rewind so the buffer is read from its first byte.
    xlsx_buffer.seek(0)
    save_file('folder/to/upload/to', 'filename.xlsx', xlsx_buffer)
Writing to multiple sheets:
with BytesIO() as xlsx_buffer:
    # The writer must be closed before the buffer is read, hence the
    # nested block.
    with pd.ExcelWriter(  # pylint: disable=abstract-class-instantiated
        xlsx_buffer,
        engine="xlsxwriter",
    ) as writer:
        # Same dataframe written to two sheets.
        for sheet in ('sheet1', 'sheet2'):
            df.to_excel(writer, sheet_name=sheet, index=False)
    # Rewind so the buffer is read from its first byte.
    xlsx_buffer.seek(0)
    save_file('folder/to/upload/to', 'filename.xlsx', xlsx_buffer)
How can I append a row at the top of an excel sheet? Goal as follows:
The file itself is written by using pandas.df.to_excel as follows:
import pandas
with pandas.ExcelWriter(output_filename) as writer:
    for csv_path in files:
        # Sheet name: the file name without ".csv", underscores turned into
        # spaces, then title-cased.
        sheet_title = csv_path.replace(".csv", "").replace("_", " ").title()
        pandas.read_csv(csv_path).to_excel(writer, sheet_name=sheet_title, index=False)
Here is one way to do it using XlsxWriter as the Excel engine:
with pandas.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
    for csv_path in files:
        sheet_name = csv_path.replace(".csv", "").replace("_", " ").title()
        frame = pandas.read_csv(csv_path)
        # startrow=1 shifts the table down one row, leaving the first row free.
        frame.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1)
        # Fill the freed cell A1 through the XlsxWriter worksheet object.
        writer.sheets[sheet_name].write('A1', 'Here is some additional text')
You can use openpyxl to edit your Excel file afterwards:
import contextlib
import openpyxl
import pandas as pd
new_row = "THIS ROW IS APPENDED AFTER THE FILE IS WRITTEN BY PANDAS"

# Re-open the workbook pandas wrote and push every sheet down by one row.
with contextlib.closing(openpyxl.load_workbook(output_filename)) as wb:
    for file in files:
        sheet_name = file.replace(".csv", "").replace("_", " ").title()
        sheet = wb[sheet_name]
        # openpyxl rows are 1-based, so the top row is row 1.
        sheet.insert_rows(1)
        sheet["A1"] = new_row
    # Save once, after all sheets have been edited.
    wb.save(output_filename)
My goal:
If the Excel file does not exist, create it and copy the data table into it;
if the Excel file exists, copy the data table to a new sheet.
But when the following code runs, it only copies the data to the new sheet; the original sheet in the Excel file is removed.
import os
import pandas as pd
import openpyxl
f_name = "123.xlsx"  # target Excel file

if not os.path.exists(f_name):
    # The Excel file does not exist yet: create it.
    pd.DataFrame().to_excel(f_name)  # create the Excel file
    writer = pd.ExcelWriter(f_name)
    table_1 = pd.DataFrame(pd.read_excel("table_1.xlsx"))  # get table_1
    table_1.to_excel(writer, sheet_name="sheet1", index=False)  # write table_1 into the file
    writer.save()
    writer.close()
else:
    # The Excel file exists: add the table to another sheet.
    wb = openpyxl.load_workbook(f_name)  # load the Excel file
    writer = pd.ExcelWriter(f_name, engine="openpyxl")
    writer.wb = wb  # stores the loaded workbook on the writer under the name `wb`
    table_2 = pd.DataFrame(pd.read_excel("table_2.xlsx"))  # table to be added
    table_2.to_excel(writer, sheet_name="sheet2", index=False)  # write to another sheet
    writer.save()
    writer.close()
import os
import pandas as pd
import openpyxl
f_name = "123.xlsx"  # target Excel file

if os.path.exists(f_name):
    # The workbook exists: open it in append mode so its current sheets are
    # kept, and write the table onto a sheet of its own. With
    # if_sheet_exists="new", a clash with an existing sheet name gets a new,
    # renamed sheet instead of overwriting or raising.
    df = pd.read_excel("table_2.xlsx")  # table to be added to the Excel file
    with pd.ExcelWriter(
        f_name, engine="openpyxl", mode="a", if_sheet_exists="new"
    ) as writer:
        df.to_excel(writer, sheet_name="table_2", index=False)
else:
    # The workbook does not exist: creating it and writing the first table
    # is a single to_excel() call.
    df = pd.read_excel("table_1.xlsx")  # get table_1
    df.to_excel(f_name, sheet_name="table_1", index=False)
Can someone please help me here? I do not get any output, and I do not get an error message either. I am trying to filter a dataframe into multiple subsets using customer conditions and paste each subset into its own Excel worksheet.
Master_data(df) Output A Output B
import pandas as pd
import os
## Belgium\2020\GMC Prep Automation")
from openpyxl import load_workbook
import xlsxwriter
from shutil import copyfile
# Ask for the workbook and split its name into stem and extension.
file = input("please enter excelfile: ")
filename, extension = os.path.splitext(file)

pth = "\\we.interbrew.net\\DFSEurope\\Crown Jewels\\Revenue Management\\WEST\\2. BE\\4. MPM Belgium\\2020\\GMC Prep Automation"
# Copy target: same name with "_2" added before the extension, under `pth`.
newfile = os.path.join(pth, f"{filename}_2{extension}")
#myfile = os.path.join(pth, Split_Test.xlsx)

df = pd.read_excel(file)

# Column to split on, and the distinct values found in it.
colpick = input("enter column to be splitted: ")
col = list(set(df[colpick].values))
def sendtoexcel(col):
    """Copy ``file`` to ``newfile`` and write one sheet per value in ``col``.

    Each sheet is named after a value and holds the rows of ``df`` whose
    ``colpick`` column equals that value.

    Args:
        col: Values of the ``colpick`` column to write sheets for.
    """
    copyfile(file, newfile)
    # The outer loop repeats the full write once for every entry in `col`.
    for _ in col:
        writer = pd.ExcelWriter(newfile, engine='openpyxl')
        for myname in col:
            matching_rows = df[df[colpick] == myname]
            matching_rows.to_excel(writer, sheet_name=myname, index=False)
        writer.save()
    print("\nCompleted")
Assuming the user enters a correct file name and an existing column, consider a single groupby run instead of a double for loop over the same column. The code is wrapped in try/except in case the user enters an incorrect column name or there is some issue with exporting the data frame to Excel.
from openpyxl import load_workbook
...
colpick = input("enter column to be splitted: ")
colpick = colpick.title().strip()


def sendtoexcel():
    """Add one worksheet per distinct ``colpick`` value to the workbook ``file``.

    The workbook is opened in append mode so the sheets already in it are
    kept. Each group of ``df`` goes to a sheet named after its key. Any
    error is printed rather than raised.
    """
    try:
        # mode='a' keeps the existing sheets; if_sheet_exists='new' gives a
        # clashing sheet name a new, renamed sheet instead of replacing data.
        with pd.ExcelWriter(
            file, engine='openpyxl', mode='a', if_sheet_exists='new'
        ) as writer:
            # Group on the bare column name so each key is a scalar, and
            # turn it into text because sheet names must be strings.
            for key, sub in df.groupby(colpick):
                sub.to_excel(writer, sheet_name=str(key), index=False)
        # Leaving the with-block saves and closes the workbook.
    except Exception as e:
        # Deliberately broad: report a bad column name or an export problem
        # to the user instead of crashing.
        print(e)


# ACTUALLY RUN FUNCTION
sendtoexcel()