Compress excel file in python

Right now my final output is in Excel format. I want to compress my Excel file using gzip. Is there a way to do it?
import pandas as pd
import gzip
import re

def renaming_ad_unit():
    with gzip.open('weekly_direct_house.xlsx.gz') as f:
        df = pd.read_excel(f)
    result = df['Ad unit'].to_list()
    for index, a_string in enumerate(result):
        modified_string = re.sub(r"\([^()]*\)", "", a_string)
        df.at[index, 'Ad unit'] = modified_string
    return df.to_excel('weekly_direct_house.xlsx', index=False)

Yes, this is possible.
To create a gzip file, you can open the file like this:
with gzip.open('filename.xlsx.gz', 'wb') as f:
    ...
Unfortunately, when I tried this, I found that I get the error OSError: Negative seek in write mode. This is because the Pandas excel writer moves backwards in the file when writing, and uses multiple passes to write the file. This is not allowed by the gzip module.
To fix this, I created a temporary file and wrote the Excel output there. Then I read that file back and wrote it into the compressed archive.
I wrote a short program to demonstrate this. It reads an excel file from a gzip archive, prints it out, and writes it back to another gzip file.
import pandas as pd
import gzip
import tempfile

def main():
    with gzip.open('apportionment-2020-table02.xlsx.gz') as f:
        df = pd.read_excel(f)
    print(df)
    with tempfile.TemporaryFile() as excel_f:
        df.to_excel(excel_f, index=False)
        with gzip.open('output.xlsx.gz', 'wb') as gzip_f:
            excel_f.seek(0)
            gzip_f.write(excel_f.read())

if __name__ == '__main__':
    main()
Here's the file I'm using to demonstrate this: Link

You could also use io.BytesIO to create a file in memory, write the Excel data into it, and then write that buffer to disk as gzip.
I used the link to the Excel file from Nick ODell's answer.
import pandas as pd
import gzip
import io
df = pd.read_excel('https://www2.census.gov/programs-surveys/decennial/2020/data/apportionment/apportionment-2020-table02.xlsx')
buf = io.BytesIO()
df.to_excel(buf)
buf.seek(0) # move to the beginning of file
with gzip.open('output.xlsx.gz', 'wb') as f:
    f.write(buf.read())
Similar to Nick ODell's answer:
import pandas as pd
import gzip
import io
df = pd.read_excel('https://www2.census.gov/programs-surveys/decennial/2020/data/apportionment/apportionment-2020-table02.xlsx')
with io.BytesIO() as buf:
    df.to_excel(buf)
    buf.seek(0)  # move to the beginning of file
    with gzip.open('output.xlsx.gz', 'wb') as f:
        f.write(buf.read())
Tested on Linux

Related

How to read a csvfile on FTP that is compressed on a zip/folder

I'm trying to:
read a .csv file (compressed in a zip file stored on FTP) using ftplib
store the .csv file in a virtual file in memory using io
transform the virtual file into a dataframe using pandas
For that I'm using the code below, and it works fine for the first scenario (path1, see image above):
CODE:
import ftplib
import zipfile
import io
import pandas as pd
ftp = ftplib.FTP("theserver_name")
ftp.login("my_username","my_password")
ftp.encoding = "utf-8"
ftp.cwd('folder1/folder2')
filename = 'zipFile1.zip'
download_file = io.BytesIO()
ftp.retrbinary("RETR " + filename, download_file.write)
download_file.seek(0)
zfile = zipfile.ZipFile(download_file)
df = pd.read_csv(zfile.namelist()[0], delimiter=';')
display(df)
But in the second scenario (path2), after changing my code, I get the error below:
CODE UPDATE:
ftp.cwd('folder1/folder2/')
filename = 'zipFile2.zip'
ERROR AFTER UPDATE:
FileNotFoundError: [Errno 2] No such file or directory:
'folder3/csvFile2.csv'
It seems like Python doesn't recognize folder3 (contained in zipFile2). Is there any explanation for that, please? How can we fix it? I tried ftp.cwd('folder3') right before pd.read_csv(), but it doesn't work.
Thanks to Serge Ballesta in his post here, I finally figured out how to transform csvFile2.csv into a DataFrame:
import ftplib
import zipfile
import io
import pandas as pd
ftp = ftplib.FTP("theserver_name")
ftp.login("my_username","my_password")
ftp.encoding = "utf-8"
flo = io.BytesIO()
ftp.retrbinary('RETR /folder1/folder2/zipFile2.zip', flo.write)
flo.seek(0)
with zipfile.ZipFile(flo) as archive:
    with archive.open('folder3/csvFile2.csv') as fd:
        df = pd.read_csv(fd, delimiter=';')
display(df)
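If you are not sure of the exact path of the CSV inside the archive, you can list the members first and pick the one you need. A small sketch reusing the same flo buffer from above; picking the first .csv member is just an example:
import zipfile
import pandas as pd

with zipfile.ZipFile(flo) as archive:
    # zip members keep their internal folder prefix (e.g. 'folder3/csvFile2.csv'),
    # regardless of any ftp.cwd() done on the server
    print(archive.namelist())
    csv_name = next(n for n in archive.namelist() if n.endswith('.csv'))
    with archive.open(csv_name) as fd:
        df = pd.read_csv(fd, delimiter=';')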

How to stream a large gzipped .tsv file from s3, process it, and write back to a new file on s3?

I have a large file s3://my-bucket/in.tsv.gz that I would like to load and process, then write its processed version back to an s3 output file s3://my-bucket/out.tsv.gz.
How do I stream in.tsv.gz directly from s3 without loading the whole file into memory (it cannot fit in memory)?
How do I write the processed gzipped stream directly to s3?
In the following code, I show how I was thinking of loading the input gzipped dataframe from s3, and how I would write the .tsv if it were located locally (bucket_dir_local = './').
import pandas as pd
import s3fs
import os
import gzip
import csv
import io
bucket_dir = 's3://my-bucket/annotations/'
df = pd.read_csv(os.path.join(bucket_dir, 'in.tsv.gz'), sep='\t', compression="gzip")
bucket_dir_local='./'
# not sure how to do it with an s3 path
with gzip.open(os.path.join(bucket_dir_local, 'out.tsv.gz'), "w") as f:
    with io.TextIOWrapper(f, encoding='utf-8') as wrapper:
        w = csv.DictWriter(wrapper, fieldnames=['test', 'testing'], extrasaction="ignore")
        w.writeheader()
        for index, row in df.iterrows():
            my_dict = {"test": index, "testing": row[6]}
            w.writerow(my_dict)
Edit: smart_open looks like the way to go.
Here is a dummy example to read a file from s3 and write it back to s3 using smart_open
from smart_open import open
import os
bucket_dir = "s3://my-bucket/annotations/"
with open(os.path.join(bucket_dir, "in.tsv.gz"), "rb") as fin:
    with open(os.path.join(bucket_dir, "out.tsv.gz"), "wb") as fout:
        for line in fin:
            l = [i.strip() for i in line.decode().split("\t")]
            string = "\t".join(l) + "\n"
            fout.write(string.encode())
For downloading the file, you can stream the S3 object directly in Python. I'd recommend reading that entire post, but here are some key lines from it:
import boto3
s3 = boto3.client('s3', aws_access_key_id='mykey', aws_secret_access_key='mysecret') # your authentication may vary
obj = s3.get_object(Bucket='my-bucket', Key='my/precious/object')
import gzip
body = obj['Body']
with gzip.open(body, 'rt') as gf:
    for ln in gf:
        process(ln)
Unfortunately, S3 doesn't support true streaming input, but this SO answer has an implementation that chunks out the file and sends each chunk up to S3. While not a "true stream", it will let you upload large files without needing to keep the entire thing in memory.
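For the upload side, a multipart upload does that chunking for you. Below is a minimal sketch with plain boto3, assuming the bucket and key names from the question and an already-gzipped local file out.tsv.gz; the 8 MB chunk size is illustrative (S3 requires every part except the last to be at least 5 MB):
import boto3

s3 = boto3.client('s3')  # assumes credentials are configured as in the snippet above
bucket, key = 'my-bucket', 'annotations/out.tsv.gz'

# start a multipart upload and send the data in chunks,
# so the whole file never has to sit in memory at once
mpu = s3.create_multipart_upload(Bucket=bucket, Key=key)
parts = []
with open('out.tsv.gz', 'rb') as f:
    part_number = 1
    while True:
        chunk = f.read(8 * 1024 * 1024)  # 8 MB per part
        if not chunk:
            break
        resp = s3.upload_part(Bucket=bucket, Key=key, PartNumber=part_number,
                              UploadId=mpu['UploadId'], Body=chunk)
        parts.append({'PartNumber': part_number, 'ETag': resp['ETag']})
        part_number += 1
s3.complete_multipart_upload(Bucket=bucket, Key=key, UploadId=mpu['UploadId'],
                             MultipartUpload={'Parts': parts})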

How to read a specific file from a tar file using Windows?

I have a tar file with several files compressed in it. I need to read one specific file (it is in csv format) using pandas. I tried to use the following code:
import tarfile
tar = tarfile.open('my_files.tar', 'r:gz')
f = tar.extractfile('some_files/need_to_be_read.csv')
import pandas as pd
df = pd.read_csv(f.read())
but it throws up the following error:
OSError: Expected file path name or file-like object, got <class 'bytes'> type
on the last line of the code. How do I go about this to read this file?
When you call pandas.read_csv(), you need to give it a filename or file-like object. tar.extractfile() returns a file-like object. Instead of reading the file into memory, pass the file to Pandas.
So remove the .read() part:
import tarfile
tar = tarfile.open('my_files.tar', 'r:gz')
f = tar.extractfile('some_files/need_to_be_read.csv')
import pandas as pd
df = pd.read_csv(f)
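As a side note, a with block closes the archive automatically. A small sketch with the same file and member names (use 'r:gz' only if the archive really is gzip-compressed):
import tarfile
import pandas as pd

# open the archive, pull out the one member we need, and let pandas read it directly
with tarfile.open('my_files.tar') as tar:
    with tar.extractfile('some_files/need_to_be_read.csv') as f:
        df = pd.read_csv(f)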

how to convert xlsx to tab delimited files

I have quite a lot of xlsx files, and it is a pain to convert them one by one to tab-delimited files.
I would like to know if there is any solution to do this in Python. Here is what I found and what I tried, without success.
I found and tried the solution from Mass Convert .xls and .xlsx to .txt (Tab Delimited) on a Mac, but it did not work.
I also tried to do it for one file to see how it works, but with no success:
#!/usr/bin/python
import xlrd
import csv

def main():
    # I open the xlsx file
    myfile = xlrd.open_workbook('myfile.xlsx')
    # I don't know the name of the sheet
    mysheet = myfile.sheet_by_index(0)
    # I open the output csv
    myCsvfile = open('my.csv', 'wb')
    # I write the file into it
    wr = csv.writer(myCsvfile, delimiter="\t")
    for rownum in xrange(mysheet.nrows):
        wr.writerow(mysheet.row_values(rownum))
    myCsvfile.close()

if __name__ == '__main__':
    main()
No real need for the main function.
And I'm not sure about your indentation problems, but this is how I would write what you have. (And it should work, according to the first comment above.)
#!/usr/bin/python
import xlrd
import csv
# open the output csv
with open('my.csv', 'wb') as myCsvfile:
    # define a writer
    wr = csv.writer(myCsvfile, delimiter="\t")
    # open the xlsx file
    myfile = xlrd.open_workbook('myfile.xlsx')
    # get a sheet
    mysheet = myfile.sheet_by_index(0)
    # write the rows
    for rownum in xrange(mysheet.nrows):
        wr.writerow(mysheet.row_values(rownum))
Why go with so much pain when you can do it in 3 lines:
import pandas as pd
file = pd.read_excel('myfile.xlsx')
file.to_csv('myfile.txt',  # write the tab-delimited output to a new file, not over the source .xlsx
            sep="\t",
            index=False)
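Since the question is about many files, the same pandas call can be wrapped in a loop over a folder. A minimal sketch, assuming the .xlsx files sit in the current directory and that each one should become a .txt file with the same base name:
import glob
import os
import pandas as pd

for xlsx_path in glob.glob('*.xlsx'):
    out_path = os.path.splitext(xlsx_path)[0] + '.txt'
    # read the first sheet and write it out tab-delimited
    pd.read_excel(xlsx_path).to_csv(out_path, sep='\t', index=False)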

reading gzipped csv file in python 3

I'm having problems reading from a gzipped csv file with the gzip and csv libs. Here's what I got:
import gzip
import csv
import json
f = gzip.open(filename)
csvobj = csv.reader(f,delimiter = ',',quotechar="'")
for line in csvobj:
    ts = line[0]
    data_json = json.loads(line[1])
but this throws an exception:
File "C:\Users\yaronol\workspace\raw_data_from_s3\s3_data_parser.py", line 64, in download_from_S3
self.parse_dump_file(filename)
File "C:\Users\yaronol\workspace\raw_data_from_s3\s3_data_parser.py", line 30, in parse_dump_file
for line in csvobj:
_csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
gunzipping the file and opening that with csv works fine. I've also tried decoding the file text to convert from bytes to str...
What am I missing here?
The default mode for gzip.open is rb; if you wish to work with strs, you have to specify text mode explicitly:
f = gzip.open(filename, mode="rt")
OT: it is good practice to wrap I/O operations in a with block:
with gzip.open(filename, mode="rt") as f:
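Putting both suggestions together, a minimal sketch of the loop from the question, with text mode and a with block (same filename variable, delimiter, and quote character as above):
import gzip
import csv
import json

with gzip.open(filename, mode="rt") as f:  # "rt" = read text, so csv gets strs, not bytes
    csvobj = csv.reader(f, delimiter=',', quotechar="'")
    for line in csvobj:
        ts = line[0]
        data_json = json.loads(line[1])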
You are opening the file in binary mode (which is the default for gzip).
Try instead:
import gzip
import csv
f = gzip.open(filename, mode='rt')
csvobj = csv.reader(f,delimiter = ',',quotechar="'")
Late answer, but you can also use the datatable package in Python:
import datatable as dt
df = dt.fread(filename)
df.head()
