I want to export my data to a CSV file, but when I open the CSV file with Excel it doesn't appear in a columnar view; each row shows up as one string with all fields concatenated together.
What is the problem? The picture shows my goal (tabular format).
This is my code. First I export the result from my database:
import cx_Oracle
query = """select * from test"""
db = cx_Oracle.connect(conn_str,encoding="UTF-8")
curs = db.cursor()
curs.execute(query)
result = curs.fetchall()
Then I export the file to the local OS and upload it via FTP to the destination folder on another server:
import pandas as pd
from datetime import datetime,timedelta
import ftplib
df = pd.DataFrame(result)
df = df.rename_axis(None)
df.to_csv('C:\\test\\test.csv',index=False,header=False,
encoding='utf-16',sep = ',')
today = str(datetime.now() - timedelta(1))[:10]
today = today.split('-')
final_today = today[1]+today[2]
outputName = 'test%s.csv'%final_today
session = ftplib.FTP('1.1.1.1','test', 'test')
Output_Directory = '/test'
session.cwd(Output_Directory)
fh = open('C:\\test\\test.csv','rb')
session.storbinary('STOR '+ outputName, fh) # send the file
fh.close() # close file and FTP
session.quit()
The problem was solved by passing sep='\t' to the to_csv function.
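For reference, a minimal sketch of the fixed export step (reusing the result list fetched above; only the separator changes): with encoding='utf-16', Excel treats the text file as tab-delimited, which is why the comma-separated version appeared as one string per row.
import pandas as pd

# `result` is the list of rows fetched from Oracle in the snippet above.
df = pd.DataFrame(result)

# Excel treats UTF-16 text files as tab-delimited, so writing with
# sep='\t' makes the file open in a columnar view instead of one
# long string per row.
df.to_csv('C:\\test\\test.csv', index=False, header=False,
          encoding='utf-16', sep='\t')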
I am basically trying to export a parquet file into a GCS bucket, as shown in my code below, which is a GCP Cloud Function. I am getting an error on the line chunk.to_parquet(parquet_file_path, engine='fastparquet', compression='snappy') saying "No such file or directory: 'new_folder_20230206_065500/table1-20230206_065638.parquet'". The folder is getting created successfully inside the bucket, but I am not sure why the parquet file is not getting generated inside it.
import mysql.connector
import pandas as pd
from google.cloud import storage
from datetime import datetime, timedelta
import os

def extract_data_to_gcs(request):
    connection = mysql.connector.connect(
        host=os.getenv('..'),
        user=os.getenv('...'),
        password=os.getenv('...'),
        database='....'
    )
    cursor = connection.cursor(buffered=True)
    tables = ["table1", "table2", "table3"]

    client = storage.Client()
    bucket = client.bucket('data-lake-archive')

    # Create a timestamp-based folder name
    now = datetime.now()
    folder_name = now.strftime("new_folder_%Y%m%d_%H%M%S")
    folder_path = f"{folder_name}/"

    # Create the folder in the GCS bucket
    blob = bucket.blob(folder_path)
    blob.upload_from_string("", content_type="application/octet-stream")

    for table in tables:
        cursor.execute("SELECT * FROM {}".format(table))
        chunks = pd.read_sql_query("SELECT * FROM {}".format(table), connection, chunksize=5000000)
        for i, chunk in enumerate(chunks):
            chunk.columns = [str(col) for col in chunk.columns]
            ingestion_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            parquet_file_path = folder_path + f"{table}-{i}.parquet"
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            # parquet_file_path = folder_path + f'abc.parquet'
            print(f'folder path is {folder_path}')
            print(f'parquet file path is {parquet_file_path}')
            chunk.to_parquet(parquet_file_path, engine='fastparquet', compression='snappy')
            # blob = bucket.blob(folder_path + f'{table}-{i}.parquet')
            # blob.upload_from_filename(folder_path + f'{table}-{i}.parquet')

        cursor.execute("SELECT table_name, column_name FROM information_schema.key_column_usage WHERE referenced_table_name = '{}'".format(table))
        referenced_tables = cursor.fetchall()
        for referenced_table in referenced_tables:
            chunks = pd.read_sql_query("SELECT * FROM {}".format(referenced_table[0]), connection, chunksize=5000000)
            for i, chunk in enumerate(chunks):
                chunk.columns = [str(col) for col in chunk.columns]
                ingestion_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                chunk.to_parquet(f"{folder_path}{referenced_table[0]}-{ingestion_timestamp}-{i}.parquet", engine='fastparquet', compression='snappy')
                blob = bucket.blob(folder_path + f'{referenced_table[0]}-{ingestion_timestamp}-{i}.parquet')
                blob.upload_from_filename(folder_path + f'{referenced_table[0]}-{ingestion_timestamp}-{i}.parquet')

    return 'Data extracted and uploaded to GCS'
Do you need to create the folder first? I'm not familiar with Google Cloud, but that might be a cause of the issue. folder_path = f"{folder_name}/" only exists in the bucket; create this folder locally before calling chunk.to_parquet(...).
Where exactly are the errors thrown? There are two lines with chunk.to_parquet(). Can you reduce the error down to a specific line?
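For what it's worth, a minimal sketch of one way to avoid the error, assuming the same bucket, folder prefix, and chunking as in the question: to_parquet() writes to the local filesystem, and inside a Cloud Function only /tmp is writable, so the file can be staged there and then uploaded to the bucket. The helper name and the /tmp staging are assumptions, not the poster's code.
import os

def upload_chunk_as_parquet(chunk, bucket, folder_path, table, i):
    # to_parquet() writes to the local filesystem; in a Cloud Function
    # only /tmp is writable, so stage the file there first (assumption).
    local_path = f"/tmp/{table}-{i}.parquet"
    chunk.to_parquet(local_path, engine='fastparquet', compression='snappy')

    # Copy the staged file into the bucket under the timestamped "folder" prefix.
    blob = bucket.blob(folder_path + f"{table}-{i}.parquet")
    blob.upload_from_filename(local_path)

    # Clean up so /tmp does not fill up across chunks.
    os.remove(local_path)
Inside the chunk loop above, the call would then be upload_chunk_as_parquet(chunk, bucket, folder_path, table, i) instead of writing directly to the folder_path-based path.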
I am trying to figure out why, when I export a database view from an MSSQL database, the resulting CSV file only contains the view's header columns. For example, this is the file that I am currently getting:
"",""
"RIG ID","Date - Rig"
"RIG ID","Date - Rig"
"RIG ID","Date - Rig"
...
That is the export code that I am running:
import csv
import os
import pyodbc
# Rig db Params/Vars
# Rigs data csv file path and name.
filePath = os.getcwd() + '/'
fileName = 'export.csv'
...
# SQL to select data from the rigs table.
rigs_export_sql = "SELECT TOP 10 'RIG ID', 'Date - Rig' FROM schema_name.view_name"
def export_rigs_data():
    # Database connection variable.
    connect = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=' +
                             server+';DATABASE='+database+';UID='+username+';PWD=' + password)
    # Cursor to execute query.
    cursor = connect.cursor()
    # Execute query.
    cursor.execute(rigs_export_sql)
    # Fetch the data returned.
    results = cursor.fetchall()
    # Extract the table headers.
    headers = [i[0] for i in cursor.description]
    # Open CSV file for writing.
    csvFile = csv.writer(open(filePath + fileName, 'w', newline=''),
                         delimiter=',', lineterminator='\r\n',
                         quoting=csv.QUOTE_ALL, escapechar='\\')
    # Add the headers and data to the CSV file.
    csvFile.writerow(headers)
    csvFile.writerows(results)

if __name__ == "__main__":
    export_rigs_data()
What am I missing?
Note: when I execute the following query, SELECT TOP 10 * FROM schema_name.view_name, I get all of the data and the headers fine.
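A likely explanation, based only on the query shown (an assumption, not confirmed by the poster): in T-SQL, single quotes create string literals, so 'RIG ID' and 'Date - Rig' are returned as constant text for every row instead of being read as column names. Square brackets mark identifiers that contain spaces, e.g.:
# In T-SQL, 'RIG ID' is a string literal (the same text for every row),
# while [RIG ID] is an identifier and selects the actual column.
rigs_export_sql = "SELECT TOP 10 [RIG ID], [Date - Rig] FROM schema_name.view_name"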
I am working on a project where I want to import a huge amount of data into ArangoDB. All the data is in .xlsx form with multiple worksheets, so I wrote a script that converts the .xlsx files to JSON files (one JSON file for each worksheet), then establishes a connection with ArangoDB and does a bulk import of the data. I wrote the script in a Jupyter notebook on a Windows PC with the latest Anaconda version, and it works like a charm with either local or remote database connections. After I saw that the code works, I copied the script to my CentOS 7 virtual server and ran it, and it crashed. I also ran it on a physical machine with Ubuntu 19.10, and it crashed there too. Both Linux machines were up to date and also ran the latest Anaconda version. The script was run both from the command line as a .py file and as an .ipynb file in Jupyter Notebook on all the machines (Windows and Linux). On Windows it works perfectly; on Linux it crashes as soon as it starts to convert the first .xlsx file. The code for the script is this:
from zipfile import ZipFile
from bs4 import BeautifulSoup
import pandas as pd
from xlsx2csv import Xlsx2csv as x2csv
import os
import hashlib
import json
import numpy as np
from arango import ArangoClient
import glob

filelist = []
hash_dict = {}
current_folder = os.getcwd()
for file in os.listdir(current_folder):
    if file.endswith(".xlsx"):
        filelist.append(file)

#create a list of all worksheets contained in the workbook
def create_sheet_list(file):
    with ZipFile(file) as zipped_file:
        summary = zipped_file.open(r'xl/workbook.xml').read()
    soup = BeautifulSoup(summary, "xml")
    sheets = [sheet.get("name") for sheet in soup.find_all("sheet")]
    return sheets

#create an array of dataframes from all the worksheets
def create_dataframes(file):
    xl = pd.ExcelFile(file)
    xl.sheet_names
    dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
    return dfs

def create_json(file, sheets, dfs):
    print(("The file contains {} sheets").format(len(sheets)))
    count = 0
    for i in sheets:
        json_filelist = []
        count = count + 1
        #produce the dataframe and check if there are any encoding errors
        try:
            df = dfs[i]
            new_header = df.iloc[0]
            df = df[1:]
            df.columns = new_header
            df = df.fillna(0)
            hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
            hash_str = int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
            values = str(hash_str)
            df['Hash'] = np.nan
            df['Hash'] = df['Hash'].fillna(value=values)
            #hash_dict.update({hash_str_name : values})
            hash_dict[hash_str_name] = values
            json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient="records")
        #For the dataframes that will get an error because of encoding, a different way of conversion will be used
        except UnicodeEncodeError:
            x2csv(file, outputencoding="utf-8").convert(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')), count)
            df = pd.read_csv(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')), header=1)
            hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
            hash_str = int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
            values = str(hash_str)
            df['Hash'] = np.nan
            df['Hash'] = df['Hash'].fillna(value=values)
            #hash_dict.update({hash_str_name : values})
            hash_dict[hash_str_name] = values
            json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient="records")
            os.remove(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')))

#Create connection with the database
def create_db_connection():
    client = ArangoClient(hosts='http://127.0.0.1:8529')
    db = client.db('CEM', username='root', password='123456')
    return db

#Get the list of the .json files from all the folders
def list_of_json():
    path = os.getcwd()
    folders = os.listdir(path)
    json_names = []
    for folder in folders:
        files = glob.glob(path+"/"+folder+"/"+"*.json")
        if len(files) > 0:
            json_names.append(files)
    return json_names

#Get the list of the collections in the database
def list_of_collections(sheets, db):
    for col in sheets:
        col = col.replace(' ','_')
        if db.has_collection(col):
            collect = db.collection(col)
        else:
            collect = db.create_collection(col)
    collection = db.collections()
    collection = [i['name'] for i in collection if i['name'][0].isupper()]
    return collection

#Import the data from the .json files to the appropriate collections
def import_data(json_names, collection, db):
    for x in json_names:
        for y in x:
            for z in collection:
                with open(y, "r") as json_file:
                    if y.endswith("{}.json".format(z)):
                        data = json.load(json_file)
                        z = db.collection(z)
                        z.import_bulk(data)

for file in filelist:
    try:
        #create the folder where the .json files from that UFED will be stored
        new_folder = os.mkdir(os.getcwd()+"/"+file.strip('.xlsx'))
        #get the path for the new folder
        new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
    except FileExistsError:
        #if the folder already exists just get its path
        new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
    print(new_path)
    #print the name of the file that's being analyzed so that we have a measure of progress
    print(("Now I am working with {} file").format(file))
    #call the functions and run the program
    create_sheet_list(file)
    create_dataframes(file)
    sheets = create_sheet_list(file)
    dfs = create_dataframes(file)
    create_json(file, sheets, dfs)
    df_dict = pd.DataFrame(list(hash_dict.items()), index=None, columns=["File_name", "Hash_num"])
    df_dict.to_json(current_folder+"/hash_list.json", orient="records")
    create_db_connection()
    db = create_db_connection()
    #create_collections(sheets,db)
    list_of_json()
    json_names = list_of_json()
    list_of_collections(sheets, db)
    collection = list_of_collections(sheets, db)
    import_data(json_names, collection, db)
Can anyone help?
I am new to Python programming and am seeking some help/guidance in correcting my Python code.
Here is my query.
I have one Excel file which has 7 tabs.
I have one folder which contains 7 different text files; each text file contains the SQL query for its respective tab, and each text file's name is the same as the tab name in the Excel file.
I have written Python code to loop through the text files one by one, execute each text file's SQL query, and dump the output data into the existing Excel file in the respective sheet/tab. I am using pandas to do this. The code runs fine, but while updating the data into Excel, pandas removes all existing sheets from the file and writes only the current output data.
Example: if the Python code executes a text file (filename: Data), then after executing its SQL query we get some data, and this data should be dumped into the Excel file (sheet name: Data).
import pypyodbc
import pandas as pd
import os
import ctypes
from pandas import ExcelWriter

fpath = r"C:\MNaveed\DataScience\Python Practice New\SQL Queries"
xlfile = r"C:\MNaveed\DataScience\Python Practice New\SQL Queries\Open_Case_Data.xlsx"

cnxn = pypyodbc.connect('Driver={SQL Server};Server=MyServerName;Database=MyDatabaseName;Trusted_Connection=Yes')
cursor = cnxn.cursor()

for subdir, dirs, files in os.walk(fpath):
    for file in files:
        #print(os.path.join(subdir,file))
        filepath = os.path.join(subdir,file)
        #print("FilePath: ", filepath)
        if filepath.endswith(".txt"):
            if file != "ClosedAging_Cont.txt":
                txtdata = open(filepath, 'r')
                script = txtdata.read().strip()
                txtdata.close()
                cursor.execute(script)
                if file == "ClosedAging.txt":
                    txtdata = open(os.path.join(subdir,"ClosedAging_Cont.txt"), 'r')
                    script = txtdata.read().strip()
                    txtdata.close()
                    cursor.execute(script)
                col = [desc[0] for desc in cursor.description]
                data = cursor.fetchall()
                df = pd.DataFrame(list(data),columns=col)
                #save_xls(df,xlfile)
                writer = pd.ExcelWriter(xlfile)
                flnm = file.replace('.txt','').strip()
                df.to_excel(writer,sheet_name=flnm,index=False)
                writer.save()
                print(file, " : Successfully Updated.")
            else:
                print(file, " : Ignoring this File")
        else:
            print(file, " : Ignoring this File")

ctypes.windll.user32.MessageBoxW(0,"Open Case Reporting Data Successfully Updated","Open Case Reporting",1)
By looping through the text files, you overwrite the Excel file inside the loop each time. Instead, instantiate pd.ExcelWriter(xlfile) and call writer.save() outside the loop.
The following example is adapted from the xlsxwriter documentation.
You can find more information about multiple sheets here: xlsxwriter documentation - multiple sheets
import pandas as pd

# Create a Pandas Excel writer using XlsxWriter as the engine outside the loop.
writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter')

# Sample loop, replace with directory browsing loop
for i in range(7):
    # Sample Pandas dataframe. Replace with SQL query and resulting data frame.
    df = pd.DataFrame({'DataFromSQLQuery': ['SQL query result {0}'.format(i)]})

    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name='Sheet{0}'.format(i))

# Close the Pandas Excel writer and output the Excel file.
writer.save()
The following code addresses the concrete question but is untested.
import pypyodbc
import pandas as pd
import os
import ctypes
from pandas import ExcelWriter

fpath = r"C:\MNaveed\DataScience\Python Practice New\SQL Queries"
xlfile = r"C:\MNaveed\DataScience\Python Practice New\SQL Queries\Open_Case_Data.xlsx"

cnxn = pypyodbc.connect('Driver={SQL Server};Server=MyServerName;Database=MyDatabaseName;Trusted_Connection=Yes')
cursor = cnxn.cursor()

# Create a Pandas Excel writer using XlsxWriter as the engine outside the loop
writer = pd.ExcelWriter(xlfile, engine='xlsxwriter')

# File loop
for subdir, dirs, files in os.walk(fpath):
    for file in files:
        filepath = os.path.join(subdir,file)
        if filepath.endswith(".txt"):
            if file != "ClosedAging_Cont.txt":
                txtdata = open(filepath, 'r')
                script = txtdata.read().strip()
                txtdata.close()
                cursor.execute(script)
                if file == "ClosedAging.txt":
                    txtdata = open(os.path.join(subdir,"ClosedAging_Cont.txt"), 'r')
                    script = txtdata.read().strip()
                    txtdata.close()
                    cursor.execute(script)
                col = [desc[0] for desc in cursor.description]
                data = cursor.fetchall()

                # Data frame from original question
                df = pd.DataFrame(list(data),columns=col)

                # Convert the dataframe to an XlsxWriter Excel object
                flnm = file.replace('.txt','').strip()
                df.to_excel(writer, sheet_name=flnm, index=False)

                print(file, " : Successfully Updated.")
            else:
                print(file, " : Ignoring this File")
        else:
            print(file, " : Ignoring this File")

# Close the Pandas Excel writer and output the Excel file
writer.save()

ctypes.windll.user32.MessageBoxW(0,"Open Case Reporting Data Successfully Updated","Open Case Reporting",1)
I have a dataframe that I'm exporting to Excel, and people want it in .xlsx. I use to_excel, but when I change the extension from .xls to .xlsx, the exporting step takes about 9 seconds as opposed to 1 second. Exporting to a .csv is even faster, which I believe is due to the fact that it's just a specially formatted text file.
Perhaps the .xlsx format simply supports many more features, so it takes longer to write, but I'm hoping there is something I can do to prevent this.
Pandas defaults to using OpenPyXL for writing xlsx files, which can be slower than the xlwt module used for writing xls files.
Try it instead with XlsxWriter as the xlsx output engine:
df.to_excel('file.xlsx', sheet_name='Sheet1', engine='xlsxwriter')
It should be as fast as the xls engine.
As per various Python-to-Excel module benchmarks, pyexcelerate has better performance.
The code below is used to dump SQLite table data into xlsx worksheets. A table is stored in the xlsx file only if its row count is under 1,000,000 rows; otherwise the data is stored in a compressed CSV file.
def passfile(datb, tables):
    """copy to xlsx or csv files tables from query results"""
    import sqlite3
    import pandas as pd
    import timeit
    import csv
    from pyexcelerate import Workbook
    from pathlib import Path
    from datetime import date

    dat_dir = Path("C:/XML")
    db_path = dat_dir / datb
    start_time = timeit.default_timer()
    conn = sqlite3.connect(db_path)  # database connection
    c = conn.cursor()
    today = date.today()
    tablist = []
    with open(tables, 'r') as csv_file:  # tables to be collected file
        csv_reader = csv.DictReader(csv_file)
        for line in csv_reader:
            tablist.append(line['table'])  # column header
    xls_file = "Param" + today.strftime("%y%m%d") + ".xlsx"
    xls_path = dat_dir / xls_file  # xls file path-name
    csv_path = dat_dir / "csv"  # csv path to store big data
    wb = Workbook()  # excelerator file init
    for line in tablist:
        try:
            df = pd.read_sql_query("select * from " + line + ";", conn)  # pandas dataframe from sqlite
            if len(df) > 1000000:  # excel not supported
                print('save to csv')
                csv_loc = line + today.strftime("%y%m%d") + '.csv.gz'  # compressed csv file name
                df.to_csv(csv_path / csv_loc, compression='gzip')
            else:
                data = [df.columns.tolist()] + df.values.tolist()
                data = [[index] + row for index, row in zip(df.index, data)]
                wb.new_sheet(line, data=data)
        except sqlite3.Error as error:  # sqlite error handling
            print('SQLite error: %s' % (' '.join(error.args)))
    print("saving workbook")
    wb.save(xls_path)
    end_time = timeit.default_timer()
    delta = round(end_time - start_time, 2)
    print("Took " + str(delta) + " secs")
    c.close()
    conn.close()

passfile("20200522_sqlite.db", "tablesSQL.csv")