I am trying to read several (~30) CSV files I have stored on my PC.
import pandas as pd

i = 2
Liste1 = []
Liste2 = []
x = 0
while i < 32:
    string = str(i)
    if i < 10:
        try:
            Name = 'D:\\FTPDaten\\2020\\Alle\\2020010' + string + '.csv'
            Tabelle = pd.read_csv(Name, sep=';', decimal=",", header=0, usecols=[7, 20])
            Tabelle.columns = ['AC', 'DC']
            neueTabelle1 = Tabelle['AC']
            neueTabelle = Tabelle['DC']
            Schleifenlaenge = len(neueTabelle)
            j = 0
            del(Tabelle)
            while j < Schleifenlaenge:
                Datenwert1 = neueTabelle.iloc[j]
                Datenwert2 = neueTabelle1.iloc[j]
                Liste1.append(Datenwert1)
                Liste2.append(Datenwert2)
                j = j + 1
        except FileNotFoundError:
            i = i + 1
    elif i >= 10 and i < 32:
        try:
            Name = 'D:\\FTPDaten\\2020\\Alle\\202001' + string + '.csv'
            Tabelle = pd.read_csv(Name, sep=';', decimal=",", header=0, usecols=[7, 20])
            Tabelle.columns = ['AC', 'DC']
            neueTabelle1 = Tabelle['AC']
            neueTabelle = Tabelle['DC']
            Schleifenlaenge = len(neueTabelle)
            j = 0
            while j < Schleifenlaenge:
                Datenwert1 = neueTabelle1.iloc[j]
                Datenwert2 = neueTabelle.iloc[j]
                Liste1.append(Datenwert1)
                Liste2.append(Datenwert2)
                j = j + 1
        except FileNotFoundError:
            i = i + 1
    i = i + 1
The first time through the while loop everything works fine, but on the next iteration read_csv does not read the file like it did before. I would expect to get a DataFrame in which the 7th and 20th columns are stored, but I get a DataFrame with no content at all, just the header.
I have tried a lot, but I can't fix it.
The issue was with how it was reading in the ';' in the other files (all except the first one). If you open them in Excel, you might be able to see what I'm talking about. So what you'll need to do is skip those rows at the beginning of each file.
import pandas as pd

Liste1 = []
Liste2 = []

for i in range(2, 32):
    skipRows = 7
    if i != 2:
        skipRows += 1
    if i < 10:
        Name = 'D:\\FTPDaten\\2020\\Alle\\2020010{string}.csv'.format(string=i)
    else:
        Name = 'D:\\FTPDaten\\2020\\Alle\\202001{string}.csv'.format(string=i)
    try:
        Tabelle = pd.read_csv(Name, sep=';', decimal=",", header=0, usecols=[7, 20], skiprows=skipRows)
        Tabelle.columns = ['AC', 'DC']
        if i < 10:
            Datenwert1 = list(Tabelle['DC'])
            Datenwert2 = list(Tabelle['AC'])
        else:
            Datenwert1 = list(Tabelle['AC'])
            Datenwert2 = list(Tabelle['DC'])
        Liste1 += Datenwert1
        Liste2 += Datenwert2
    except FileNotFoundError as e:
        print(e)

df = pd.DataFrame({'col1': Liste1,  # <-- change 'col1', 'col2' to whatever you want to name them
                   'col2': Liste2})
Try creating a new dataframe here instead of iterating over the existing one:
cols = ['AC', 'DC']
new_Tabelle = pd.DataFrame(columns=cols)
new_Tabelle['AC'] = Tabelle['AC']
new_Tabelle['DC'] = Tabelle['DC']
I don't have 30 semicolon-delimited files. However, this can be simplified to pick up only the files that exist and match a pattern, using glob:
import pandas as pd
from pathlib import Path
import random

# generate some sample semicolon-delimited files to demonstrate with
for i in range(30):
    with open(f"2020010_os_{i}.csv", "w") as fp:
        fp.write(f"id;val\n{i};{random.randint(10, 20)}\n")

pd.concat([pd.read_csv(fn, sep=";") for fn in Path.cwd().glob("2020010*.csv")])
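One caveat worth noting: glob yields files in an arbitrary, OS-dependent order, so if the concatenated frame should follow the dates it is safer to iterate over sorted(Path.cwd().glob("2020010*.csv")).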
How can I copy a row, for example from D51 to F51, and paste those values into the row from T20 to AF20?
I know how to load a spreadsheet:
workbook = load_workbook(output)
sheet = workbook.active
But I don't know how to iterate in a loop to get this:
sheet["T2"] = "=D6"
sheet["U2"] = "=E6"
sheet["V2"] = "=F6"
sheet["W2"] = "=G6"
sheet["X2"] = "=H6"
sheet["Y2"] = "=I6"
sheet["Z2"] = "=J6"
sheet["AA2"] = "=K6"
sheet["AB2"] = "=L6"
sheet["AC2"] = "=M6"
sheet["AD2"] = "=N6"
sheet["AE2"] = "=O6"
sheet["AF2"] = "=P6"
You can achieve this by using the code below.
Note that the file output.xlsx is opened, updated and saved. The function num_to_excel_col is borrowed from here.
This updates columns 20 (T) onward for the next 15 columns (all in row 2) with the text "=D6", "=E6", etc. The num_to_excel_col function converts a column number to the equivalent Excel column string (e.g. 27 is converted to AA).
import openpyxl

workbook = openpyxl.load_workbook('output.xlsx')
ws = workbook.active

def num_to_excel_col(n):
    if n < 1:
        raise ValueError("Number must be positive")
    result = ""
    while True:
        if n > 26:
            n, r = divmod(n - 1, 26)
            result = chr(r + ord('A')) + result
        else:
            return chr(n + ord('A') - 1) + result

outcol = 4  # paste in col 'D'
for col in range(20, 35):  # col 20 is T; do this for the next 15 columns
    txt = "=" + num_to_excel_col(outcol) + "6"
    print(txt)
    ws.cell(row=2, column=col).value = txt
    outcol += 1

workbook.save("output.xlsx")
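If the goal is to paste the literal values rather than formulas, a variant of the same loop could read the source cells directly instead of writing formula strings; a sketch under the same row/column assumptions as above (note that with a default load_workbook, a source cell that itself holds a formula yields the formula string; loading with data_only=True gives the cached results instead):

import openpyxl

workbook = openpyxl.load_workbook('output.xlsx')
ws = workbook.active

outcol = 4  # read from column 'D' of row 6 onward
for col in range(20, 35):  # write into row 2, columns T onward
    # copy the stored value of the source cell rather than a formula reference
    ws.cell(row=2, column=col).value = ws.cell(row=6, column=outcol).value
    outcol += 1

workbook.save("output.xlsx")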
I am trying to split up a JSON file from the Alpha Vantage API into separate files depending on the date. I am also trying to reformat the file to have blank values in the gaps where dates are missing. The following code is what I have come up with, but it gives me TypeError: 'list' object is not callable. I'm fairly new to Python and pandas, so I'm sure there is a better way to go about this.
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
from pandas import DataFrame
import json

symbol = "MSFT"
symbol_list = symbol.split(",")

def num_el(list):
    count = 0
    for element in list:
        count += 1
    return count

def csv_make(sy, dar, dat):
    csv_file = open(f"{sy}_1min_{dar}.csv", "w", newline="")
    csv_file.write(dat)
    csv_file.close()

i = 0
x = -1
n = num_el(symbol_list)
while i < n:
    namesym = symbol_list[x]
    ticker = namesym
    api_key = 'APIKEYHERE'
    url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&outputsize=full&interval=1min&apikey={api_key}'
    data = requests.get(url)
    dsf = data.json()
    daf = pd.DataFrame(dsf['Time Series (1min)'])
    dxf: DataFrame = daf.T
    dxf.index.name = 'time'
    dxf.reset_index(inplace=True)
    dxf['time'] = pd.to_datetime(dxf['time'])
    dxf['minute'] = dxf['time'].dt.time
    dxf['day'] = dxf['time'].dt.day
    dxf['date'] = dxf['time'].dt.date
    agg = dxf.groupby([dxf['day']])
    length1 = dxf.groupby([dxf['day']]).size()
    length = pd.DataFrame(length1)
    length.index.name = 'day'
    length.reset_index(inplace=True)
    length_sum = length[0].sum()
    v = 0
    d = length_sum
    b = len(length)
    x2 = length_sum
    while v < b:
        a = length[0][v]
        x2 -= length[0][v]
        xd = agg.get_group(length['day'][v])
        date = xd['date'][x2]
        max_dt = parser.parse(str(max(xd['minute'])))
        min_dt = parser.parse(str(min(xd['minute'])))
        dt_range = []
        while min_dt <= max_dt:
            dt_range.append(min_dt.strftime("%H:%M:%S"))
            min_dt += timedelta(seconds=60)
        complete_df = pd.DataFrame({'minute': dt_range})
        xy = complete_df.astype('str')
        yx = xd.astype('str')
        dasf = xy.merge(yx, how='left', on='minute')
        dasf['ev'] = np.where(dasf['1. open'].notnull(), 'False', 'True')
        time = []
        open = []
        high = []
        low = []
        close = []
        volume = []
        empty_value = []
        for ib in range(len(dasf)):
            time.append(dasf['minute'][ib])
            open.append(dasf['1. open'][ib])
            high.append(dasf['2. high'][ib])
            low.append(dasf['3. low'][ib])
            close.append(dasf['4. close'][ib])
            volume.append(dasf['5. volume'][ib])
            empty_value.append(dasf['ev'][ib])
        time_df = pd.DataFrame(time).rename(columns={0: 'Time'})
        open_df = pd.DataFrame(open).rename(columns={0: 'Open'})
        high_df = pd.DataFrame(high).rename(columns={0: 'High'})
        low_df = pd.DataFrame(low).rename(columns={0: 'Low'})
        close_df = pd.DataFrame(close).rename(columns={0: 'Close'})
        volume_df = pd.DataFrame(volume).rename(columns={0: 'Volume'})
        empty_value_df = pd.DataFrame(empty_value).rename(columns={0: 'Empty Value'})
        frames = [time_df, open_df, high_df, low_df, close_df, volume_df, empty_value_df]
        df = pd.concat(frames, axis=1, join='inner')
        df = df.set_index('Time')
        ad = df.to_csv()
        csv_make(namesym, date, ad)
        v += 1
    i += 1
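As a side note on the error itself: the script rebinds the built-in name open to a list (open = [] inside the inner loop, in the same global namespace that csv_make looks up), so when csv_make later calls open(...) that name refers to a list and raises exactly TypeError: 'list' object is not callable. (The parameter named list in num_el shadows a built-in too, though only locally.) A minimal reproduction of the failure mode:

# minimal reproduction of the failure mode, not the poster's exact script
open = []                       # rebinds the built-in open() to a list
try:
    fh = open("out.csv", "w")   # now "calls" the list -> TypeError
except TypeError as e:
    print(e)                    # 'list' object is not callable
# renaming the list (e.g. open_prices = []) avoids the clash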
I wrote a script that searches an Excel document for 'X', and when it finds an 'X' it copies the first column and first row associated with the 'X' into a CSV file.
I've been told that there's a better way to do this with 'if' statements, but I'm not quite sure how.
Here's the code:
import xlrd
import csv

### Grab the data from sheet 1
def get_row_values(workSheet, row):
    to_return = []
    num_cells = myWorksheet.ncols - 1
    curr_cell = -1
    while curr_cell < num_cells:
        curr_cell += 1
        cell_value = myWorksheet.cell_value(row, curr_cell)
        to_return.append(cell_value)
    return to_return

file_path = 'foo.xlsx'
output = []

# Write the data
myWorkbook = xlrd.open_workbook(file_path)
myWorksheet = myWorkbook.sheet_by_name('foosheet')
num_rows = myWorksheet.nrows - 1
curr_row = 0
column_names = get_row_values(myWorksheet, curr_row)
#print("TOTAL ENTRIES:")
#print len(column_names)
#print("-----")
framework_name = myWorksheet.cell(0, 2)
framework_version = myWorksheet.cell(0, 3)
while curr_row < num_rows:
    curr_row += 1
    row = myWorksheet.row(curr_row)
    this_row = get_row_values(myWorksheet, curr_row)
    x = 0
    while x < len(this_row):
        if this_row[x] == 'x':
            output.append(['', fooname, foo_version,
                           foo_names[x], foo_row[0]])
            myData = [["foo1", "foo2",
                       "foo3", "foo4", "foo5"]]
            myFile = open('./results/barTemp.csv', 'w')
            with myFile:
                writer = csv.writer(myFile)
                writer.writerows(myData)
                writer.writerows(output)
        x += 1

#print output
myFile.close()
myWorkbook.release_resources()
It's not necessarily better; it still has the same runtime complexity.
The difference would be a more compact line:
For example, you can change
while x < len(this_row):
to
for x in this_row:
but I see that you use the 'x' index to find column_names[x], so another approach might be better, such as
for x in range(len(this_row)):
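If both the index and the value are needed, enumerate gives you the two together; a small sketch against the same loop shape (keeping the placeholder foo* names from the question):

for x, cell in enumerate(this_row):
    if cell == 'x':
        # x is still the column index, so lookups like column_names[x] keep working
        output.append(['', fooname, foo_version, foo_names[x], foo_row[0]])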
I have some issues while reading txt files. What I have to do is read files (about 360) and make a plot. Everything works except when there is a special character in my file, such as: "". When my reading function finds that character, it crashes. Is there any way to skip it? My code:
import os
import matplotlib.pyplot as plt
import numpy as np

i = 10
j = 0
X = []
Y = []
Z = []
k = 0
A = np.zeros([360, 719])
for i in range(10, 360, 10):
    X = []
    Y = []
    if len(str(i)) == 2:
        data = open(dir + '\\150317_ScPONd_0%s_radio.txt' % i, 'r')
    else:
        data = open(dir + '\\150317_ScPONd_%s_radio.txt' % i, 'r')
    z = data.readlines()
    data.close()
    for line in z:
        if not line.startswith('$'):
            data_2 = line.split('\t')
            X.append(data_2[0])
            Y.append(data_2[1])
    A[j, :] = X
    A[(j+1), :] = Y
And here is what my file looks like:
Is there any way to skip those "$" lines? Sorry for the picture, I have no idea how to attach it better.
Thanks to @user1753919 I have found an answer. If someone is still interested in this, here is the working code:
for i in range(10, 360, 10):
    X = []
    Y = []
    if len(str(i)) == 2:
        data = np.genfromtxt(dir + '\\150317_ScPONd_0%s_radio.txt' % i, skip_header=12)
    else:
        data = np.genfromtxt(dir + '\\150317_ScPONd_%s_radio.txt' % i, skip_header=12)
    for line in data:
        X.append(line[0])
        Y.append(line[1])
    A[j, :] = X
    A[(j+1), :] = Y
    plt.plot(A[j, :], A[(j+1), :], label='{} K'.format(i))
    plt.hold
    j = j + 2
genfromtxt is overkill.
np.loadtxt(file, comments='$')
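Applied to the loop above, that might look like the sketch below; it keeps the poster's file-name pattern and dir variable, and assumes two tab-separated numeric columns. The '%03d' zero-pads the angle the same way the original if/else did:

import numpy as np

for i in range(10, 360, 10):
    fname = dir + '\\150317_ScPONd_%03d_radio.txt' % i  # 10 -> '010', 100 -> '100'
    data = np.loadtxt(fname, comments='$')  # lines starting with '$' are ignored
    X, Y = data[:, 0], data[:, 1]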
Hello, I am currently working on my project. I want to get candidate text blocks by using the algorithm below.
My input is a CSV document which contains:
HTML column : the HTML code in a line
TAG column : the tag of the HTML code in a line
Words : the text inside the tag in a line
TC : the number of words in a line
LTC : the number of anchor words in a line
TG : the number of tags in a line
P : the number of p and br tags in a line
CTTD : TC + (0.2*LTC) + TG - P (see the sketch after this list)
CTTDs : the smoothed CTTD
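For reference, the CTTD formula above translates directly into pandas; a minimal sketch, assuming the column names listed above (the file name is hypothetical):

import pandas as pd

# hypothetical file name; any of the smoothed CSVs described above would do
df = pd.read_csv('Smoothing001.csv')
df['CTTD'] = df['TC'] + 0.2 * df['LTC'] + df['TG'] - df['P']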
This is my algorithm to find candidate text blocks. I load each CSV file into a dataframe using pandas, and I use the CTTDs, TC and TG columns to find the candidates.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np
import csv

filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0
for f in filenames:
    file_html = open(str(f), "r")
    df = pd.read_csv(file_html)
    #df = pd.read_csv('smoothing/Smoothing001.csv')
    news = np.array(df['CTTDs'])
    new = np.array(df['TG'])
    minval = np.min(news[np.nonzero(news)])
    maxval = np.max(news[np.nonzero(news)])
    j = 0.2
    thetaCTTD = minval + j * (maxval - minval)
    #maxGap = np.max(new[np.nonzero(new)])
    #minGap = np.min(new[np.nonzero(new)])
    thetaGap = np.min(new[np.nonzero(new)])
    #print thetaCTTD
    #print maxval
    #print minval
    #print thetaGap

    def create_candidates(df, thetaCTTD, thetaGAP):
        k = 0
        TB = {}
        TC = 0
        for index in range(0, len(df) - 1):
            start = index
            if df.ix[index]['CTTDs'] > thetaCTTD:
                start = index
                gap = 0
                TC = df.ix[index]['TC']
                for index in range(index + 1, len(df) - 1):
                    if df.ix[index]['TG'] == 0:
                        continue
                    elif df.ix[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
                        break
                    elif df.ix[index]['CTTDs'] <= thetaCTTD:
                        gap += 1
                    TC += df.ix[index]['TC']
                if (TC < 1) or (start == index):
                    continue
                TB.update({
                    k: {
                        'start': start,
                        'end': index - 1
                    }
                })
                k += 1
        return TB

    def get_unique_candidate(TB):
        TB = tb.copy()
        for key, value in tb.iteritems():
            if key == len(tb) - 1:
                break
            if value['end'] == tb[key+1]['end']:
                del TB[key+1]
            elif value['start'] < tb[key+1]['start'] < value['end']:
                TB[key]['end'] = tb[key+1]['start'] - 1
            else:
                continue
        return TB

    index += 1
    stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
    tb = create_candidates(df, thetaCTTD, thetaGap)
    TB = get_unique_candidate(tb)
    filewrite = open(stored_file, "wb")
    df_list = []
    for (k, d) in TB.iteritems():
        candidate_df = df.loc[d['start']:d['end']]
        candidate_df['candidate'] = k
        df_list.append(candidate_df)
    output_df = pd.concat(df_list)
    output_df.to_csv(stored_file)
    writer = csv.writer(filewrite, lineterminator='\n')
    filewrite.close()
thetaCTTD is 10.36 and thetaGap is 1.
The output is:
The output means there are 2 candidate text blocks. The first candidate starts at line number 215 and ends at line number 225 (like the picture below), and the other starts at line number 500 and ends at line number 501.
My question is: how do I save the output to CSV so that not only the line numbers but the whole range of each text block, with all the other columns, appears in the output too?
My expected output is like this screenshot of a candidate text block.
Assuming your output TB is a dictionary of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.iteritems()])
Note that we slice by label, so d['end'] will be included.
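A quick illustration of that label-versus-position difference (a generic example, not tied to the question's data):

import pandas as pd

df = pd.DataFrame({'a': range(10)})
print(len(df.loc[2:5]))   # 4 rows: label slicing includes the end label 5
print(len(df.iloc[2:5]))  # 3 rows: positional slicing excludes the stop index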
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
df_list = []
for (k, d) in TB.iteritems():
    candidate_df = df.loc[d['start']:d['end']]
    candidate_df['candidate'] = k
    df_list.append(candidate_df)

output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.