Read CSV from within Zip File

Read CSV from within Zip File - python

I have a directory of zip files (approximately 10,000 small files), within each is a CSV file I am trying to read and split into a number of different CSV files.
I managed to write the code to split the CSV files from a directory of CSVs, shown below, that reads the first atttribute of the CSV, and depending what it is write it to the relevent CSV.
import csv
import os
import sys
import re
import glob
reader = csv.reader(open("C:/Projects/test.csv", "rb"), delimiter=',', quotechar='"')
write10 = csv.writer(open('ouput10.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
write15 = csv.writer(open('ouput15.csv', 'w'), delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
headings10=["RECORD_IDENTIFIER","CUSTODIAN_NAME","LOCAL_CUSTODIAN_NAME","PROCESS_DATE","VOLUME_NUMBER","ENTRY_DATE","TIME_STAMP","VERSION","FILE_TYPE"]
write10.writerow(headings10)
headings15=["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","USRN","STREET_DESCRIPTION","LOCALITY_NAME","TOWN_NAME","ADMINSTRATIVE_AREA","LANGUAGE"]
write15.writerow(headings15)
for row in reader:
type = row[0]
if "10" in type:
write10.writerow(row)
elif "15" in type:
write15.writerow(row)
So I am now trying to read the Zip files rather than wasting time extracting them first.
This is what I have so far after following as many tutorials as I have found
import glob
import os
import csv
import zipfile
import StringIO
for name in glob.glob('C:/Projects/abase/*.zip'):
base = os.path.basename(name)
filename = os.path.splitext(base)[0]
datadirectory = 'C:/Projects/abase/'
dataFile = filename
archive = '.'.join([dataFile, 'zip'])
fullpath = ''.join([datadirectory, archive])
csv = '.'.join([dataFile, 'csv'])
filehandle = open(fullpath, 'rb')
zfile = zipfile.ZipFile(filehandle)
data = StringIO.StringIO(zfile.read(csv))
reader = csv.reader(data)
for row in reader:
print row
However and error gets thrown
AttributeError: 'str' object has no attribute 'reader'
Hopefully someone can show me how to change my CSV reading code that works to read the Zip file.
Much appreciated
Tim

Simple fix. You're overriding the csv module with your local csv variable. Just change the name of that variable:
import glob
import os
import csv
import zipfile
import StringIO
for name in glob.glob('C:/Projects/abase/*.zip'):
base = os.path.basename(name)
filename = os.path.splitext(base)[0]
datadirectory = 'C:/Projects/abase/'
dataFile = filename
archive = '.'.join([dataFile, 'zip'])
fullpath = ''.join([datadirectory, archive])
csv_file = '.'.join([dataFile, 'csv']) #all fixed
filehandle = open(fullpath, 'rb')
zfile = zipfile.ZipFile(filehandle)
data = StringIO.StringIO(zfile.read(csv_file)) #don't forget this line!
reader = csv.reader(data)
for row in reader:
print row

Related

How process csv to hdfs using apache nifi?

Hello guys hope you doing well !
I have some csv files want to put them in hdfs and if a file already exists it should append his content to the existing content I tries a script in python but with no results
import os
import pandas as pd
from os import path
import sys,json
import csv
from csv import writer,reader
data = json.load(sys.stdin)
technologies = ['KPI_2G_NPO','GPRS']
old_path = data["old.path"]
filename = data["filename"]
old_path = old_path.replace("C:\\Users\\12\\Desktop\\APACHE~1\\NIFI-1~1.1\\","")
old_path = old_path.replace("/","")
old_path_list = old_path.split('\\')
def append_list_as_row(file_name, list_of_elem):
with open(file_name, 'a+', newline='') as write_obj:
csv_writer = writer(write_obj)
csv_writer.writerow(list_of_elem)
df = pd.read_csv(data["new.path"]+data["filename"])
columns = df.columns.values.tolist()
for tech in technologies:
if (tech in filename and old_path_list[0] in filename):
if path.exists("hdfs://quickstart.cloudera:8020/user/cloudera/data/"+tech+"_"+old_path_list[0]+".csv"):
header_saved = True
with open(data["new.path"]+data["filename"]) as file2:
header = next(file2)
header = next(file2)
if header_saved:
for line in file2:
append_list_as_row("hdfs://quickstart.cloudera:8020/user/cloudera/data/"+tech+"_"+old_path_list[0]+".csv",list(line.split(",")))
os.remove(data["new.path"]+data["filename"])
else:
df.to_csv("hdfs://quickstart.cloudera:8020/user/cloudera/data/"+tech+"_"+old_path_list[0]+".csv")
os.remove(data["new.path"]+data["filename"])
and here's my nifi pipline picture

How to Read CSV, Create a QR Code, and Write its Filename to New Column?

I'm writing a Python script to generate a QR code from the first column in a csv (concatenated with a local name), and that part works well. The csv just has three columns and looks like this:
ID First Last
144 Jerry Seinfeld
491 George Costanza
104 Elaine Benes
99 Cosmo Kramer
And I use my Python script to take that file, append a prefix to the IDs (in this case, 'NBC') and then create QR codes for each record in a new folder. It's a little long but all of this seems to work fine also:
import csv
import qrcode
import os
import shutil
import time
import inquirer
#Identify Timestamp
timestr = time.strftime("%Y%m%d-%H%M%S")
local = 'NBC'
#Load csv
filename = "stackoverflowtest.csv"
#Path to new local folder
localfolder = local
localimagefolder = localfolder+'/image'
localfilefolder = localfolder+'/files'
#Check/create folders based on local
if not os.path.exists(localfolder):
os.makedirs(localfolder)
if not os.path.exists(localimagefolder):
os.makedirs(localimagefolder)
if not os.path.exists(localfilefolder):
os.makedirs(localfilefolder)
#Copy uploaded file to their local's file folder
shutil.copy2(filename, localfilefolder+'/'+local+'-'+timestr+'.csv') # complete target filename given
#Read csv and generate QR code for local+first column of csv
with open(filename, 'rU') as csvfile:
next(csvfile, None) #skip header row
reader = csv.reader(csvfile, delimiter=',', dialect=csv.excel_tab)
for i, row in enumerate(reader):
labeldata = row[0] #Choose first column of data to create QR codes
print labeldata
qr = qrcode.QRCode(
version=1,
error_correction=qrcode.constants.ERROR_CORRECT_L,
box_size=10,
border=4,
)
qr.add_data(local+"-"+labeldata)
qr.make()
img = qr.make_image()
img.save(localimagefolder+"/"+local+"-"+labeldata+".png".format(i)) #Save image
It creates the NBC folder, copies each csv file in one subfolder, and creates the QR codes for each ID (NBC-144,NBC-491,NBC-104,NBC-99) in another.
The part where I'm running into a problem is opening the csv and writing the filepath/filename back to the csv (or a copy of the csv since from what I've read, I likely can't do it to the same one). Is that possible?
The closest I've come with a script that works is appending the local name with the ID and writing that back to a column but I can't seem to figure out how to do the same with a variable, let alone a filepath/filename:
import csv
import os
import sys
filename = 'stackoverflowtest.csv'
newfilename = 'stackoverflowtest2.csv'
local = 'NBC'
with open(filename, 'rU') as f:
reader = csv.reader(f)
with open(newfilename, 'w') as g:
writer = csv.writer(g)
for row in reader:
new_row = row[0:] + ['-'.join([local, row[0]])]
writer.writerow(new_row)
Is it possible to write something like that within my existing script to add a column for the filepath and filename? Everything I try breaks -- especially if I attempt to do it in the same script.
EDIT:
This is my closest attempt that overwrote the existing file
f=open(newfilename,'r+')
w=csv.writer(f)
for path, dirs, files in os.walk(path):
for filename in files:
w.writerow([newfilename])
Also it's still in a separate script.

Since I can't run the code in your question directly, I had to commented-out portions of it in what's below for testing, but think it does everything you wanted in one loop in one script.
import csv
#import qrcode
import os
import shutil
import time
#import inquirer
# Identify Timestamp
timestr = time.strftime("%Y%m%d-%H%M%S")
local = 'NBC'
# Load csv
filename = "stackoverflowtest.csv"
# Path to new local folder
localfolder = local
localimagefolder = os.path.join(localfolder, 'image')
localfilefolder = os.path.join(localfolder, 'files')
# Check/create folders based on local
if not os.path.exists(localfolder):
os.makedirs(localfolder)
if not os.path.exists(localimagefolder):
os.makedirs(localimagefolder)
if not os.path.exists(localfilefolder):
os.makedirs(localfilefolder)
# Copy uploaded file to their local's file folder
target = os.path.join(localfilefolder, local+'-'+timestr+'.csv') # Target filename
#shutil.copy2(filename, target) # Don't need to do this.
# Read csv and generate QR code for local+first column of csv
with open(filename, 'rb') as csvfile, open(target, 'wb') as outfile:
reader = csv.reader(csvfile, delimiter=',', dialect=csv.excel_tab)
writer = csv.writer(outfile, delimiter=',', dialect=csv.excel_tab)
next(reader) # Skip header row.
for row in reader:
id, first, last = row
# qr = qrcode.QRCode(
# version=1,
# error_correction=qrcode.constants.ERROR_CORRECT_L,
# box_size=10,
# border=4,
# )
#
# qr.add_data(local+"-"+id)
# qr.make()
#
# img = qr.make_image()
imagepath = os.path.join(localimagefolder, local+"-"+id+".png")
# img.save(imagepath) # Save image.
print "saving img:", imagepath
writer.writerow(row + [local+'-'+id, imagepath])
Output from sample input data:
144,Jerry,Seinfeld,NBC-144,NBC/image/NBC-144.png
491,George,Costanza,NBC-491,NBC/image/NBC-491.png
104,Elaine,Benes,NBC-104,NBC/image/NBC-104.png
99,Cosmo,Kramer,NBC-99,NBC/image/NBC-99.png

Process multiple files in Python with one code

I'm having some trouble with my code. I want to do the following: I have about 30 csv files which I need to process and I want the result in one single csv file. So far I have the following code (sample):
import pandas as pd
import csv as csv
df = pd.read_csv ("file.csv",
delimiter=",",skiprows=1)
datamean10=df[61:240].mean()
datamean15=df[241:420].mean()
list10=[]
list15=[]
list10.append(datamean10.clip(0))
list15.append(datamean15.clip(0))
csvfile = "C:/Users/bla/bla/list10.csv"
with open(csvfile, 'w') as output:
writer = csv.writer(output, lineterminator='\n')
writer.writerows(list10)
csvfile = "C:/Users/bla/bla/list15.csv"
with open(csvfile, 'w') as output:
writer = csv.writer(output, lineterminator='\n')
writer.writerows(list15)
So this code does what I want for a single file. I wrote another script to go through all the files:
import sys, os
for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
for file in files:
if file.endswith('.csv'):
os.system ('averagelist.py {}'.format(root + '\\' + file))
Needless to say this script deletes the lists again and starts off with a blank list while I want to append the results to the same list. So that the resulting files have a row of average values for each file that is processed. Can somebody tell me how to do this? Thank you very much in advance.

This should be a working combination.
import pandas as pd
import csv as csv
df = pd.read_csv ("file.csv",
delimiter=",",skiprows=1)
datamean10=df[61:240].mean()
datamean15=df[241:420].mean()
list10=[]
list15=[]
list10.append(datamean10.clip(0))
list15.append(datamean15.clip(0))
csvfile = "C:/Users/bla/bla/list10.csv"
with open(csvfile, 'w') as output:
writer = csv.writer(output, lineterminator='\n')
writer.writerows(list10)
import sys, os
for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
for file in files:
if file.endswith('.csv'):
#csvfile = "C:/Users/bla/bla/list15.csv"
with open(file, 'w') as output:
writer = csv.writer(output, lineterminator='\n')
writer.writerows(list15)

Just open your output files in append mode inside the inner script:
with open(csvfile, 'a') as output:
and truncate then only in master script:
with open("C:/Users/bla/bla/list10.csv", 'w') as output:
pass
with open("C:/Users/bla/bla/list15.csv", 'w') as output:
pass
for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
...
But using system to call a Python script from another Python script is bad. I would be better to write the working part of the inner script in a function, and then use it from the outer script after importing it:
Inner script (averagelist.py)
import pandas as pd
import csv as csv
def average(file):
df = pd.read_csv (file,
delimiter=",",skiprows=1)
...
with open(csvfile, 'w') as output:
writer = csv.writer(output, lineterminator='\n')
writer.writerows(list15)
outer script:
import os
import averagelist
with open("C:/Users/bla/bla/list10.csv", 'w') as output:
pass
with open("C:/Users/bla/bla/list15.csv", 'w') as output:
pass
for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
for file in files:
if file.endswith('.csv'):
averagelist.average(root + '\\' + file))

copy and rename images based on a csv managing duplicates python

I'm working on a script in python to copy and rename a bunch of images based on a csv.
The image folder named "originals" is like:
Alpha_1.jpg
Beta_2.jpg
And the csv. contains:
Alfa_1.jpg 4474094_1
Beta_2.jpg 4474094_2
Beta_2.jpg 4474094_3
So the result should leave on a new folder named "newnames" the sames images like:
4474094_1.jpg
4474094_2.jpg
4474094_3.jpg
Where the 4474094_2.jpg and 4474094_3.jpg is the same picture as Beta_2.jpg
I have the following code which is not working, please any advice I would be most grateful!
import os
import csv
import sys
import shutil
def copy_rename():
os.chdir(r"C:\Transformer-SSBI\Original")
saved_path = os.getcwd()
file_list = os.listdir(saved_path)
src_dir= r"C:\Transformer-SSBI\Originals"
dst_dir= r"C:\Transformer-SSBI\Newnames"
IDs = {}
with open (r'transformer.csv','rb') as csvfile:
Reader = csv.reader(csvfile, delimiter = ';')
for row in Reader:
IDs[row[0]] = row[1]+'.jpg'
for row in IDs:
for file_name in file_list:
if file_name in row:
oldname = shutil.copy(file_name,dst_dir)
newname = IDs[file_name]
os.rename(oldname, newname)
copy_rename()

since you store the mapping in a map, and Beta_2.jpg is renamed to two files,there can be only one key in the map,so it will only be renamed to 4474094_3.jpg,not 4474094_2.jpg,you can avoid the construction of map, and just do the renaming while iterating the csv files:
import os
import csv
import sys
import shutil
def copy_rename():
src_dir= r"C:\Transformer-SSBI\Originals"
dst_dir= r"C:\Transformer-SSBI\Newnames"
or.chdir(dst_dir)
with open (r'transformer.csv','rb') as csvfile:
Reader = csv.reader(csvfile, delimiter = ',')
for row in Reader:
oldname=row[0]
newname=row[1]+".jpg"
if os.path.exists(src_dir+"\\"+oldname):
shutil.copy(src_dir+"\\"+oldname,dst_dir)
os.rename(oldname, newname)
copy_rename()

This is like #Samuelliyi answer, except it avoids any race conditions and is (slightly) more cross platform by using os.path.join.
import os
import csv
import sys
import errno
import shutil
def copy_rename(src_dir, dst_dir, csv_path=None):
if csv_path is None:
csv_path = os.path.join(dst_dir, 'transformer.csv')
with open (csv_path, mode='rb') as csvfile:
Reader = csv.reader(csvfile, delimiter = ',')
for row in Reader:
oldname = row[0]
newname = row[1] + os.path.splitext(oldname)[1]
oldpath = os.path.join(src_dir, oldname)
newpath = os.path.join(dst_dir, newname)
try:
# the rename is implicit in the copy operation
shutil.copy(oldpath, newpath)
except OSError as e:
# only raise exception if it is something other than the file
# not existing
if e.errno != errno.ENOENT:
raise
src_dir= r"C:\Transformer-SSBI\Originals"
dst_dir= r"C:\Transformer-SSBI\Newnames"
copy_rename(src_dir, dst_dir)
Also, the function is now more general and can be used on any two directories that have the same structure (don't hardcode what you can pass in as a parameter).

convert multiple excel files to csv files with python

Is there a way to convert multiple files?
Can I use glob.glob?
import sys
sys.path.insert(0,'D:/apera/Python27/xlrd-0.9.3')
import xlrd
import csv
ExcelFile = "D:/apera/Workspace/Sounding/sounding010.xls"
CSVFile = "D:/apera/Workspace/Sounding/sounding010.csv"
def Convert(ExcelFile, SheetName, CSVFile):
wb = xlrd.open_workbook(ExcelFile)
ws = wb.sheet_by_name(SheetName)
csvfile = open(CSVFile, 'wb')
wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL, delimiter=';')
for rownum in xrange(ws.nrows):
wr.writerow(
list(x.encode('latin1')
for x in ws.row_values(rownum)))
csvfile.close()
Convert(ExcelFile, "INPUT", CSVFile)

Yes, glob combined with os will work
import os
import glob
os.chdir("yourfolder")
for f in glob.glob("*.xls"):
#call your conversion function

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Read CSV from within Zip File - python

Related

How process csv to hdfs using apache nifi?

How to Read CSV, Create a QR Code, and Write its Filename to New Column?

Process multiple files in Python with one code

copy and rename images based on a csv managing duplicates python

convert multiple excel files to csv files with python

Categories

Resources