How to convert XLS to CSV files using python? - python

I'm working with python console of QGIS 2.8.1. I want to convert many xls files into csv format using python. My input directory is: D:\PATRICIA\TESTE\XLS and output is: D:\PATRICIA\TESTE\CSV2. I wrote this code based in your suggestions (Converting xls file into csv/txt file in Python) and developed it in differente files with different dates 1999/01/2 until 1999/01/31 as: RR_1999_1_2.xls, RR_1999_1_3.xls, ... RR_1999_1_31.xls
I don't know why my script doesn´t works. It means that nothing happened!
My script is:
import xlrd
import csv
import datetime as dt
from datetime import timedelta
#initial and final dates
data1='19990102'
data2='19990131'
anoi = int(data1[:4])
mesi = int(data1[4:6])
diai = int(data1[6:8])
anof = int(data2[:4])
mesf = int(data2[4:6])
diaf = int(data2[6:8])
start_date = dt.datetime(anoi, mesi, diai)
end_date = dt.datetime(anof, mesf, diaf)
total_days = (end_date - start_date).days + 1
for day in xrange(0, total_days):
current_date = (start_date + dt.timedelta(days = day)).date()
file_date = str(current_date.year)+'_'+str(current_date.month)+'_'+str(current_date.day)
srt1='D:/PATRICIA/TESTE/XLS/RR_'+file_date+'.xls'
srt2='D:/PATRICIA/TESTE/CSV2/RR_'+file_date+'.csv'
def xls_to_csv():
x = xlrd.open_workbook(str1)
x1 = x.sheet_by_name('Sheet1')
csvfile = open(str2, 'wb')
writecsv = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
for rownum in xrange(sh.nrows):
writecsv.writerow(x1.row_values(rownum))
csvfile.close()
Any help?
Thanks.

Unless I've missed an important thing, you declare the function xls_to_csv in a loop, but never call it. The general structure of your script should be:
#initializations
data1='19990102'
...
total_days = (end_date - start_date).days + 1
# function definition:
def xls_to_csv(str1, str2):
x = xlrd.open_workbook(str1)
x1 = x.sheet_by_name('Sheet1')
csvfile = open(str2, 'wb')
writecsv = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
for rownum in xrange(sh.nrows):
writecsv.writerow(x1.row_values(rownum))
csvfile.close()
# loop
for day in xrange(0, total_days):
current_date = (start_date + dt.timedelta(days = day)).date()
file_date = str(current_date.year)+'_'+str(current_date.month)+'_'+str(current_date.day)
srt1='D:/PATRICIA/TESTE/XLS/RR_'+file_date+'.xls'
srt2='D:/PATRICIA/TESTE/CSV2/RR_'+file_date+'.csv'
xls_to_csv(srt1, srt2) # function call

Related

Automating through multiple path locations and exporting file names

I have written a script which works but is not very elegant. It merges csv files, outputs a new file, filters that file to the required conditions, then outputs the filtered file, which is the file I want. I then repeat the process for every month.
Rather than altering this code to process every month (I have 5 more years worth of data to go), I would like to automate the path directory part and export csv file names that change from one month (and year) to the next.
See snippet of Jan and Feb below:
import os
import glob
import pandas as pd
import shutil
path = r"C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx01"
os.chdir(path)
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
combined_csv.to_csv("201401.csv", index=False, encoding='utf-8-sig')
grab1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx01\201401.csv'
move1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-01.csv'
shutil.move(grab1,move1)
fd = pd.read_csv(r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-01.csv')
df = pd.DataFrame(fd)
irishsea = df[(df.lat_bin >= 5300) & (df.lat_bin <= 5500) & (df.lon_bin >= -650) & (df.lon_bin <= -250)]
irishsea.to_csv("2014-01_irishsea.csv", index=False, encoding='utf-8-sig')
grab2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx01\2014-01_irishsea.csv'
move2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-01-IrishSea.csv'
shutil.move(grab2,move2)
I then repeat it for Feb data but have to update the path locations.
#process feb data
path = r"C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx02"
os.chdir(path)
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
combined_csv.to_csv("201402.csv", index=False, encoding='utf-8-sig')
grab1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx02\201402.csv'
move1 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-02.csv'
shutil.move(grab1,move1)
fd = pd.read_csv(r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-02.csv')
df = pd.DataFrame(fd)
irishsea = df[(df.lat_bin >= 5300) & (df.lat_bin <= 5500) & (df.lon_bin >= -650) & (df.lon_bin <= -250)]
irishsea.to_csv("2014-02_irishsea.csv", index=False, encoding='utf-8-sig')
grab2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\20xx02\2014-02_irishsea.csv'
move2 = r'C:\Users\jonathan.capanda\Documents\Fishing_DataBase\gfw_data\100_deg_data\daily_csvs\2014\2014-02-IrishSea.csv'
shutil.move(grab2,move2)
You can do something like the following. Keep in mind that the second number of range (the stop value) needs to be one value higher than you intend.
for year in range(2014, 2020):
for month in range(1, 13):
if month < 10:
month_as_string = "0" + str(month)
else:
month_as_string = str(month)
date = "%s\%s-%s" % (year, year, month_as_string)
pathname = 'YOUR\FILEPATH\HERE' + date + 'irishsea.csv'
You can learn more about string formatting here https://www.learnpython.org/en/String_Formatting

Pythonic way of using a variable as part of another variable?

That's the clearest I could make my title.
I have some code that reads in two CSV files. One CSV file has the data, and the other has information about this data... let's call it config.
data_jan2018.csv
data_feb2018.csv
config.csv
Now, config has columns for which dates I want to read in. I'm reading these in as follows:
data_config = pd.read_csv(loc + data_config_name)
# Read in dates from config file
dates = data_config.drop_duplicates('Start_date')
dates = dates[['Start_date','End_date']]
print(dates)
Start_date = dates['Start_date'].tolist()
End_date = dates['End_date'].tolist()
StartDate = ''.join(Start_date)
EndDate = ''.join(End_date)
print(StartDate)
print(EndDate)
date1 = datetime.strptime(StartDate, '%d%b%Y')
date2 = datetime.strptime(EndDate, '%d%b%Y')
# Loop across months
for dt in rrule.rrule(rrule.MONTHLY, dtstart=date1, until=date2):
print(dt)
reporting_date = dt.strftime('%d%b%Y')
reporting_date_fmt = dt.strftime(date_format)
print('Formatted reporting_date is ' + reporting_date_fmt)
source_data = pd.read_csv(loc + source_data_name)
source_data.columns = source_data.columns.str.lower()
As you can see, I want to read in a csv file called source_data_name. However, this file name contains my formatted reporting_date_fmt. I want the programmer to edit the file name at the beginning of the code so I have these line right at the top:
date_format = '%b%Y'
source_data_name = 'g4_RWA_sample_' + reporting_date_fmt + '.csv'
But of course this flags a warning, telling me reporting_date_fmt hasn't been created yet. Is there a workaround to this?
Define data name separately at the top of the file, then append the format and extension after the format has been defined.
data_name = 'g4_RWA_sample_'
...
source_data_name = data_name + reporting_date_fmt + '.csv'

Extract date range from csv files and writing them into new files, Nothing happens

Here is my code:
import os
import time
initial_date = '22.01.2015 02:00:00'
initial = time.mktime(time.strptime(initial_date, "%d.%m.%Y %H:%M:%S"))
final_date = '15.04.2015 03:45:00'
final = time.mktime(time.strptime(final_date, "%d.%m.%Y %H:%M:%S"))
path = 'Transfer\Praktikanten\2017-05-Sharon\M02_Modelldaten\Sofia_HW_032015_12\01.01.2015-31.12.2015_2014\22.02.2015-15.04.2015_201410XX'
directory = os.path.join("x:\\","path")
for root,dirs,files in os.walk(directory):
for files in directory:
if file.endswith(".csv"):
f_in=open(file, 'r').readlines()
Datum_Uhrzeit= []
Wasserstand= []
f_out = open('NEW_file','w')
f_out.write(f_in[0])
for i in range(1, len(f_in)):
Datum_Uhrzeit= f_in[i].split(';',)[0]
Wasserstand = f_in[i].split(';')[1]
Datum_Uhrzeit= time.mktime(time.strptime(Datum_Uhrzeit, "%d.%m.%Y %H:%M:%S"))
if initial <= Datum_Uhrzeit <= final:
f_out.write(f_in[i])
f_out.close()
I am trying to extract the given date time from all CSV files in the mentioned folder and write these data in a new_file.
Sample CSV file:
CSV file headers and data
The code returns no errors but it doesn't generate the new files.
So I managed to solve it. Here is the code if anyone needs in the future:
import os
import time
initial_date = '22.02.2015 02:00:00'
initial = time.mktime(time.strptime(initial_date, "%d.%m.%Y %H:%M:%S"))
final_date = '15.04.2015 03:45:00'
final = time.mktime(time.strptime(final_date, "%d.%m.%Y %H:%M:%S"))
path = 'x:/Transfer/Praktikanten/2017-05-Sharon/M02_Modelldaten/Sofia_HW_032015_12/01.01.2015-31.12.2015_2014/22.02.2015-15.04.2015_201410XX/'
for root,dirs,files in os.walk(path):
for file in files:
if file.endswith(".csv"):
f_out = open('NEW_' + file,'w')
f_in=open(file, 'r').readlines()
f_out.write(f_in[0])
for i in range(1, len(f_in)):
Datum_Uhrzeit= f_in[i].split(';',)[0]
Datum_Uhrzeit = time.mktime(time.strptime(Datum_Uhrzeit, "%d.%m.%Y %H:%M:%S"))
if initial <= Datum_Uhrzeit <= final:
f_out.write(f_in[i])
f_out.close()
Some of your variables contains a space. This is not allowed.
example: Datum Uhrzeit=> Datum_Uhrzeit (or even better: datum_uhrzeit).
Check PEP-8

Python - Batch combine Multiple large CSV, filter data, skip header, appending vertically into a single CSV

** Note i have modified the code below original to show a code that works for what i need
Good afternoon all
So many questions around csv data combining but as yet i haven't found anything to help me with my code requirements.
I have large fixed header CSV's that:
1) are produced over a 12hr period. i need to look up a weeks worth of csv's to merge
2) filter the individual CSV's on 2 columns information (to many rows otherwise)
3) append vertically into a single csv 'master sheet' with the naming convention 'date of the last shift'
** Files are coming out as individual CSV's. I need them to append into one
** FYI - Data set after code (there are 16 columns of data i just cut out for this purpose)
Below is what i have so far. apologies for the mess!
import os, csv
import pandas as pd
import io
import glob
from datetime import date
import time
import collections
# Process data and filter #
def ProcessData( data ):
processedData = []
for row in data:
if row[ 15 ] == ( 'OPERATING' ):
outputRow = row[ 0:3 ] + row[ 15:17 ]
processedData.append( outputRow )
return processedData
# Process and write #
def ProcessAndWrite( data, filename ):
processedData = ProcessData( data )
name, ext = os.path.splitext( filename )
outputfilename = name + '_week_combined.csv'
print "writing data to " + str( outputfilename )
with open(outputfilename, 'wb') as csvfile:
writer = csv.writer(csvfile)
for row in processedData:
writer.writerow(row)
# select the correct weeks worth of files #
def filedate( data, datetime ):
root = 'E:\Rs\\'
date_outputfilename_list = []
for file in date_outputfilename_list:
folder, file_name = os.path.split(file[1])
file_date = time.strftime("%y-%m-%d", file[0])
date_name_list.append((file_date, file_name))
date_count_dict = {}
date_name_dict = {}
for date, name in date_name_list:
date_count_dict=collections.defaultdict( int )
date_name_dict.setdefault(date, []).append(name)
import pprint
print("Files with the same date:")
pprint.pprint(date_name_dict)
print('-'*60)
print("Same dates count:")
pprint.pprint(date_count_dict)
# Function #
if __name__ == "__main__":
import sys
path = r'E:\Rs'
filenames = glob.glob(os.path.join(path, '*.csv'))
filenames.sort()
data = []
for filename in filenames:
with open(filename, 'r') as csvfile:
reader = csv.reader(csvfile, delimiter = ',')
header = []
for headerCount in range( 2 ):
header.append(next(reader))
data.extend( [ row for row in reader ] )
if( filedate ):
ProcessAndWrite( data, filename )
data = [ProcessData]
if ( len( data ) > 0 ):
ProcessAndWrite( data, filename )
Data set:
position_x, position_y, position_z, start_time, opreason, stage,
header 2, header 2, header 2, header 2, header 2, header 2
649794, 4764274, 1147, 2/11/2016 00:00, OPERATING, sound,
Amended Script that works for my purpose
import os, csv # Import csv library
import io
import glob
import datetime
import time
import collections
def ProcessData( data ): # Function definition: filter data
processedData = [] # empty process data list
for row in data:
if (row[ 15 ] == 'OPERATING' and row[ 6 ] == 'truck'): # Filter explination
n1=datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S') # Strip date from timedate for duration calc
n2=datetime.datetime.strptime(row[4], '%Y-%m-%d %H:%M:%S') # Strip date from timedate for duration calc
diff = n2 - n1 # duration calc
outputRow = row[ 0:3 ] + row[ 3:5 ] + [diff.total_seconds()]
processedData.append( outputRow ) # process the last of the list information from the csv and append new file
return processedData # Final Processed data
def ProcessAndWrite( data, filename ): # Function Definition: Write data
processedData = ProcessData( data )
name, ext = os.path.splitext( filename ) # Split the file name from the original to define the output as weeks mastersheet
outputfilename = name + '_week_combined.csv'
print "writing data to " + str( outputfilename ) # Screen output describing file to look for
with open(outputfilename, 'wb') as csvfile: # 'wb' is write binary file
writer = csv.writer(csvfile) # Next line is a hack to put headers in the csv
writer.writerow(['position_x','position_y','position_z','start_time','end_time','model','number','speed','o','stage','duration', 'cummulative_duration'])
for row in processedData:
writer.writerow(row)
if __name__ == "__main__": # Run script directly through python (not imported)
import sys
path = r'E:\\' # Set correct folder location for file merge
filenames = glob.glob(os.path.join(path, '*.csv')) # Select correct files for merge
filenames.sort() # Sort the folder so that the files are in date order to make sure you dont crash the script later
data = [] # Blank data list
def dateFromFilename( name ): # Function to select the correct files from truck speed folder
path,filename = os.path.split(name)
splitName = filename.split('_')
dateStr = splitName[0]
date = datetime.datetime.strptime(dateStr,'%Y-%m-%d') # Split file name date and words
return date # Need to put this is so it returns an actual value!
firstFileDate = None
lastFilename = None
for filename in filenames: # Select file
currentFileDate = dateFromFilename( filename )
if firstFileDate:
diff = currentFileDate - firstFileDate
# somehow convert this to days
if ( diff.days >= 1 ): # Selct the previous 24hrs worth of data
ProcessAndWrite( data, lastFilename ) # Call function to write data
data = []
firstFileDate = currentFileDate
lastFilename = filename
with open(filename, 'r') as csvfile: # For new CSV files
reader = csv.reader(csvfile, delimiter = ',') # read the csv
header = [] # Blank header list (do this to skip the header rows for merge)
for headerCount in range( 3 ): # Start reading from line 3
header.append(next(reader))
data.extend( [ row for row in reader ] ) # extend is to continue the data stacking with the next csv data
if ( len( data ) > 0 ): # If the list of data has data then continue to process and write
ProcessAndWrite( data, filename )

How to get the yahoo finance csv directly into python

Does anybody know how to get yahoo finance csv directly into python?
The problem is that when i try to get the data with this (example) link:
http://real-chart.finance.yahoo.com/table.csv?s=WU&a=4&b=20&c=2015&d=05&e=21&f=2016&g=d&ignore=.csv'
It gives a pop-up asking if i want to download the csv-file. This causes it to bugg when i try to read it in to python. My scripts is:
today = datetime.date.today()
def get_url(stock='GOOG', START_date = str(int(str(today).split('-')[0])-1)+
'-' +str(int(str(today).split('-')[1])-1) + ('-') +
str(int(str(today).split('-')[2])-1), END_date= str(today)):
baseurl = 'http://real-chart.finance.yahoo.com/table.csv?'
stock = 's=WU'
FROM_date = ('&a=' + START_date.split('-')[1] + '&b=' +
START_date.split('-')[2] + '&c=' +
START_date.split('-')[0])
TO_date = ('&d=' + END_date.split('-')[1] + '&e=' +
END_date.split('-')[2] + '&f=' + END_date.split('-')[0])
url = baseurl + stock + FROM_date + TO_date + '&g=d&ignore=.csv'
return url
rawdate = []
with open(get_url()) as csvfile:
reader = csv.reader(csvfile, delimiter = ",")
for row in reader:
rawdata.append(row)
If i download the csv first i can read it into python, but I want to get to access the csv file directly without having to download it first. Is this possible? alternatively have the csv as temp.
Thanks!
I would recommend that you use pandas. Here is a link.
import pandas.io.data as web
import datetime
start = datetime.datetime(2010, 1, 1)
end = datetime.datetime(2013, 1, 27)
f = web.DataReader("F", 'yahoo', start, end)
f.ix['2010-01-04']
Out[6]:
Open 1.017000e+01
High 1.028000e+01
Low 1.005000e+01
Close 1.028000e+01
Volume 6.085580e+07
Adj Close 8.755953e+00
Name: 2010-01-04 00:00:00, dtype: float64
Try it this way.
in this file "C:/Users/your_path/Desktop/symbols/tickers.txt"
you have the following tickers
ibm
sbux
msft
"""
import urllib
import re
import json
symbolslist = open("C:/Users/rshuell001/Desktop/symbols/tickers.txt").read()
symbolslist = symbolslist.split("\n")
for symbol in symbolslist:
myfile = open("C:/Users/rshuell001/Desktop/symbols/" +symbol +".txt", "w+")
myfile.close()
htmltext = urllib.urlopen("http://www.bloomberg.com/markets/chart/data/1D/"+ symbol+ ":US")
data = json.load(htmltext)
datapoints = data["data_values"]
myfile = open("C:/Users/rshuell001/Desktop/symbols/" +symbol +".txt", "a")
for point in datapoints:
myfile.write(str(symbol+","+str(point[0])+","+str(point[1])+"\n"))
myfile.close()
That should give you what you want.

Categories

Resources