Is there a way to convert multiple files?
Can I use glob.glob?
import sys
sys.path.insert(0,'D:/apera/Python27/xlrd-0.9.3')
import xlrd
import csv
ExcelFile = "D:/apera/Workspace/Sounding/sounding010.xls"
CSVFile = "D:/apera/Workspace/Sounding/sounding010.csv"


def Convert(ExcelFile, SheetName, CSVFile):
    """Write one sheet of an Excel workbook to a CSV file.

    ExcelFile -- path of the workbook opened with xlrd.
    SheetName -- name of the sheet to export.
    CSVFile   -- path of the CSV file to create; fields are separated by ';'
                 and every field is quoted.

    Every cell value is encoded as latin1 before it is written.
    NOTE(review): this presumably assumes every cell holds text; a numeric
    cell would have no ``encode`` method -- confirm against the workbooks.
    """
    wb = xlrd.open_workbook(ExcelFile)
    ws = wb.sheet_by_name(SheetName)
    # The with-block closes the output file even when a row fails to encode.
    with open(CSVFile, 'wb') as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL, delimiter=';')
        for rownum in xrange(ws.nrows):
            wr.writerow([x.encode('latin1') for x in ws.row_values(rownum)])


Convert(ExcelFile, "INPUT", CSVFile)
Yes, glob combined with os will work
import os
import glob
os.chdir("yourfolder")
for f in glob.glob("*.xls"):
    # Call your conversion function. Shown here with the Convert() function
    # and "INPUT" sheet from the question; each CSV is written next to its
    # workbook under the same base name.
    Convert(f, "INPUT", os.path.splitext(f)[0] + ".csv")
Related
I have 3602 txt files and I'm trying to merge them into a csv file
My code seems to work, but when checking the dataframe length it shows 5194 rows
I'm pretty sure something like commas or spaces are separating the file into more than one row
import pandas as pd
import glob
from pathlib import Path
import csv
glob_path_false = Path(r"C:\Users\julia\repos\pos\fake.br\Fake.br-Corpus\size_normalized_texts\fake")
file_list = list(map(str, glob_path_false.glob("**/*.txt")))

# Creates the .csv with all the articles (the original comment said "true").
column_names = ['artigo']
with open("result_fake.csv", 'w', newline='', encoding="utf8") as target:
    writer = csv.DictWriter(
        target,
        fieldnames=column_names,
        extrasaction='ignore',
        delimiter='/',
    )
    writer.writeheader()
    for path in file_list:
        with open(path, 'r', newline='', encoding="utf8") as source:
            # The reader yields one record per parsed CSV row (normally one
            # per line of text), so a file with several lines contributes
            # several rows. Only the first '/'-separated field is kept,
            # because 'artigo' is the only field name.
            reader = csv.DictReader(source, delimiter='/', fieldnames=column_names)
            for record in reader:
                writer.writerow(record)

df_fake = pd.read_csv('result_fake.csv', sep='/')
df_fake.count():
artigo 5194
dtype: int64
I am trying to export 2 specific lines from each of multiple (.txt) files in a path as Excel rows and write them into an Excel file. But the (.xlsx) file contains just the first row (the exported lines of only one text file).
import pandas as pd
import linecache
import xlsxwriter as xlsw
import os
import glob
# Reads line 5 and line 13 of every .txt file in the working directory and
# writes the stripped values to sheet 'welcome' of test.xlsx.
# NOTE(review): under Python 3 a non-raw 'C:\Users...' literal is rejected
# because \U starts a unicode escape -- confirm the interpreter version.
directory=('C:\Users\john\Desktop')
os.chdir(directory)
files=glob.glob('*.txt')
for filename in files:
name = linecache.getline(filename,5)
id = linecache.getline(filename,13)
info = [name,id]
# NOTE(review): final_list is rebound to a new empty list here. Indentation
# was lost in this listing; if this statement runs once per file, only the
# last file's values reach the DataFrame below -- confirm.
final_list = []
for i in info:
final_list.append(i.strip())
print (final_list)
# A flat list becomes a single column; transpose() turns it into a single row.
df = pd.DataFrame(final_list)
df = df.transpose()
writer = pd.ExcelWriter('test.xlsx', engine='xlsxwriter')
# Written without header or index, starting at the second row (startrow=1).
df.to_excel(writer, sheet_name='welcome',startrow=1,startcol=0, header=False, index=False)
writer.save()
Use:
import pandas as pd
import linecache
import xlsxwriter as xlsw
import os
import glob
directory = ('test')
os.chdir(directory)
files = glob.glob('*.txt')

# One entry per text file: [line 5, line 13], each stripped of surrounding
# whitespace. The list is created once, outside the loop, so rows accumulate.
final_list = []
for filename in files:
    name = linecache.getline(filename, 5)
    record_id = linecache.getline(filename, 13)
    final_list.append([name.strip(), record_id.strip()])
print(final_list)

# A list of lists becomes one DataFrame row per file.
df = pd.DataFrame(final_list)
# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() is not available in pandas 2.x.
with pd.ExcelWriter('test.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='welcome', startrow=1, startcol=0,
                header=False, index=False)
Hello guys, hope you are doing well!
I have some CSV files that I want to put in HDFS, and if a file already exists, the new content should be appended to the existing content. I tried a script in Python, but with no results.
import os
import pandas as pd
from os import path
import sys,json
import csv
from csv import writer,reader
# Read the job description as JSON from standard input.
data = json.load(sys.stdin)
# Substrings looked for in the incoming file name by the loop further down.
technologies = ['KPI_2G_NPO','GPRS']
old_path = data["old.path"]
filename = data["filename"]
# Strip the hard-coded directory prefix and any forward slashes, then split
# what is left on backslashes.
old_path = old_path.replace("C:\\Users\\12\\Desktop\\APACHE~1\\NIFI-1~1.1\\","")
old_path = old_path.replace("/","")
old_path_list = old_path.split('\\')
def append_list_as_row(file_name, list_of_elem):
    """Append ``list_of_elem`` to the file ``file_name`` as one CSV record.

    The file is opened in 'a+' mode, so it is created when missing and
    existing content is kept.
    """
    # newline='' leaves line-ending handling to the csv writer.
    with open(file_name, 'a+', newline='') as out_file:
        writer(out_file).writerow(list_of_elem)
# Load the incoming CSV. For each technology whose name (together with the
# first component of old_path) appears in the file name, the statements below
# either append the file's lines to a target CSV or write the DataFrame to it,
# then delete the incoming file.
# NOTE(review): indentation was lost in this listing, so which `if` the `else`
# belongs to cannot be read from here -- confirm against the real script.
df = pd.read_csv(data["new.path"]+data["filename"])
columns = df.columns.values.tolist()
for tech in technologies:
if (tech in filename and old_path_list[0] in filename):
# NOTE(review): `path` is os.path, which checks the local filesystem;
# presumably an hdfs:// URI is never found this way -- confirm and consider
# an HDFS client instead.
if path.exists("hdfs://quickstart.cloudera:8020/user/cloudera/data/"+tech+"_"+old_path_list[0]+".csv"):
header_saved = True
with open(data["new.path"]+data["filename"]) as file2:
# next() is called twice, so two lines are consumed before the loop below.
# NOTE(review): confirm that dropping the first data line is intended.
header = next(file2)
header = next(file2)
if header_saved:
for line in file2:
# Each line is split on ',' without stripping, so the last field keeps its
# line ending. NOTE(review): append_list_as_row uses the built-in open(),
# which presumably cannot write to an hdfs:// URI -- confirm.
append_list_as_row("hdfs://quickstart.cloudera:8020/user/cloudera/data/"+tech+"_"+old_path_list[0]+".csv",list(line.split(",")))
os.remove(data["new.path"]+data["filename"])
else:
df.to_csv("hdfs://quickstart.cloudera:8020/user/cloudera/data/"+tech+"_"+old_path_list[0]+".csv")
os.remove(data["new.path"]+data["filename"])
And here's a picture of my NiFi pipeline
I'm having some trouble with my code. I want to do the following: I have about 30 csv files which I need to process and I want the result in one single csv file. So far I have the following code (sample):
import pandas as pd
import csv as csv
# Load the measurements; skiprows=1 skips the first line of the file.
df = pd.read_csv("file.csv", delimiter=",", skiprows=1)

# Column means over two fixed row windows.
datamean10 = df[61:240].mean()
datamean15 = df[241:420].mean()

# Each list holds a single row: the means with negative values raised to 0.
list10 = [datamean10.clip(0)]
list15 = [datamean15.clip(0)]

# Write each list to its own file. The files are opened in 'w' mode, so
# whatever they contained before is replaced.
for csvfile, rows in (
    ("C:/Users/bla/bla/list10.csv", list10),
    ("C:/Users/bla/bla/list15.csv", list15),
):
    with open(csvfile, 'w') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(rows)
So this code does what I want for a single file. I wrote another script to go through all the files:
import sys, os
# Run averagelist.py once for every .csv file found below the data directory.
for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
    for file in files:
        if not file.endswith('.csv'):
            continue
        script_argument = root + '\\' + file
        os.system('averagelist.py {}'.format(script_argument))
Needless to say this script deletes the lists again and starts off with a blank list while I want to append the results to the same list. So that the resulting files have a row of average values for each file that is processed. Can somebody tell me how to do this? Thank you very much in advance.
This should be a working combination.
import pandas as pd
import csv as csv
import sys, os

# One row of clipped column means per input file, for each of the two row
# windows (rows 61-239 and rows 241-419 of the parsed data).
list10 = []
list15 = []
for root, dirs, files in os.walk('C:/Users/bla/bla/bla'):
    for file in files:
        if file.endswith('.csv'):
            # Input files are only ever read; they are never opened for writing.
            df = pd.read_csv(os.path.join(root, file), delimiter=",", skiprows=1)
            list10.append(df[61:240].mean().clip(0))
            list15.append(df[241:420].mean().clip(0))

# Write each result file once, after every input has been processed, so the
# output holds one row per input file.
for csvfile, rows in (
    ("C:/Users/bla/bla/list10.csv", list10),
    ("C:/Users/bla/bla/list15.csv", list15),
):
    with open(csvfile, 'w') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(rows)
Just open your output files in append mode inside the inner script:
with open(csvfile, 'a') as output:
and truncate them only in the master script:
# Opening a file in 'w' mode and writing nothing leaves it empty.
with open("C:/Users/bla/bla/list10.csv", 'w') as output:
pass
with open("C:/Users/bla/bla/list15.csv", 'w') as output:
pass
# The walk over the input directory follows; its body is elided in this excerpt.
for root, dir, files in os.walk('C:/Users/bla/bla/bla'):
...
But using system to call a Python script from another Python script is bad. It would be better to write the working part of the inner script in a function, and then use it from the outer script after importing it:
Inner script (averagelist.py)
import pandas as pd
import csv as csv
def average(file,
            list10_path="C:/Users/bla/bla/list10.csv",
            list15_path="C:/Users/bla/bla/list15.csv"):
    """Append one row of averaged values from ``file`` to each result file.

    file        -- path of the input CSV; its first line is skipped.
    list10_path -- result file for the means of rows 61-239.
    list15_path -- result file for the means of rows 241-419.

    Negative means are raised to 0 before writing. Both result files are
    opened in append mode, so calling this once per input file builds up one
    row per file; the caller is responsible for emptying them beforehand.
    """
    df = pd.read_csv(file, delimiter=",", skiprows=1)
    windows = (
        (list10_path, df[61:240]),
        (list15_path, df[241:420]),
    )
    for csvfile, window in windows:
        row = window.mean().clip(0).tolist()
        # 'a' rather than 'w': 'w' would discard the rows of earlier calls.
        with open(csvfile, 'a') as output:
            writer = csv.writer(output, lineterminator='\n')
            writer.writerow(row)
outer script:
import os
import averagelist
# Empty both result files once, before any input file is processed.
with open("C:/Users/bla/bla/list10.csv", 'w') as output:
    pass
with open("C:/Users/bla/bla/list15.csv", 'w') as output:
    pass

# Hand every .csv file found below the data directory to averagelist.average.
for root, dirs, files in os.walk('C:/Users/bla/bla/bla'):
    for file in files:
        if file.endswith('.csv'):
            averagelist.average(os.path.join(root, file))
I have a directory of zip files (approximately 10,000 small files), within each is a CSV file I am trying to read and split into a number of different CSV files.
I managed to write the code, shown below, to split the CSV files from a directory of CSVs: it reads the first attribute of each row of the CSV and, depending on what it is, writes the row to the relevant CSV.
import csv
import os
import sys
import re
import glob
headings10 = ["RECORD_IDENTIFIER","CUSTODIAN_NAME","LOCAL_CUSTODIAN_NAME","PROCESS_DATE","VOLUME_NUMBER","ENTRY_DATE","TIME_STAMP","VERSION","FILE_TYPE"]
headings15 = ["RECORD_IDENTIFIER","CHANGE_TYPE","PRO_ORDER","USRN","STREET_DESCRIPTION","LOCALITY_NAME","TOWN_NAME","ADMINSTRATIVE_AREA","LANGUAGE"]

# Route each input row to one of two output files according to its first
# field. All three files are opened in one with-statement so they are flushed
# and closed when the script finishes or fails.
with open("C:/Projects/test.csv", "rb") as source, \
        open('ouput10.csv', 'w') as out10, \
        open('ouput15.csv', 'w') as out15:
    reader = csv.reader(source, delimiter=',', quotechar='"')
    write10 = csv.writer(out10, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    write15 = csv.writer(out15, delimiter=',', lineterminator='\n', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    write10.writerow(headings10)
    write15.writerow(headings15)
    for row in reader:
        # A blank line parses to an empty list, which has no first field.
        if not row:
            continue
        record_type = row[0]
        if "10" in record_type:
            write10.writerow(row)
        elif "15" in record_type:
            write15.writerow(row)
So I am now trying to read the Zip files rather than wasting time extracting them first.
This is what I have so far after following as many tutorials as I have found
import glob
import os
import csv
import zipfile
import StringIO
for name in glob.glob('C:/Projects/abase/*.zip'):
base = os.path.basename(name)
filename = os.path.splitext(base)[0]
datadirectory = 'C:/Projects/abase/'
dataFile = filename
archive = '.'.join([dataFile, 'zip'])
fullpath = ''.join([datadirectory, archive])
csv = '.'.join([dataFile, 'csv'])
filehandle = open(fullpath, 'rb')
zfile = zipfile.ZipFile(filehandle)
data = StringIO.StringIO(zfile.read(csv))
reader = csv.reader(data)
for row in reader:
print row
However, an error gets thrown:
AttributeError: 'str' object has no attribute 'reader'
Hopefully someone can show me how to change my CSV reading code that works to read the Zip file.
Much appreciated
Tim
Simple fix. You're overriding the csv module with your local csv variable. Just change the name of that variable:
import glob
import os
import csv
import zipfile
import StringIO
for name in glob.glob('C:/Projects/abase/*.zip'):
base = os.path.basename(name)
filename = os.path.splitext(base)[0]
datadirectory = 'C:/Projects/abase/'
dataFile = filename
archive = '.'.join([dataFile, 'zip'])
fullpath = ''.join([datadirectory, archive])
csv_file = '.'.join([dataFile, 'csv']) #all fixed
filehandle = open(fullpath, 'rb')
zfile = zipfile.ZipFile(filehandle)
data = StringIO.StringIO(zfile.read(csv_file)) #don't forget this line!
reader = csv.reader(data)
for row in reader:
print row