Python pd.read_csv, .to_sql different length than actual data

I have a CSV file with ~50 million rows. After loading it into a database, I only get 21,739 rows. What am I doing wrong? Thanks.
import pandas as pd
from sqlalchemy import create_engine

chunksize = 100000
csv_database = create_engine('sqlite:///csv_database.db', pool_pre_ping=True)
i = 0
j = 0
q = 0
for df in pd.read_csv(filename, chunksize=chunksize, iterator=False):
    # df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df.index += j
    i += 1
    df.to_sql('table', csv_database, if_exists='append')
    j = df.index[-1] + 1
    q += 1
    print("q: " + repr(q))
columnx = df.iloc[:, 0]
columny = df.iloc[:, 1]
columnz = df.iloc[:, 2]
columnmass = df.iloc[:, 3]
Out: [21739 rows x 1 columns] etc. etc.

In [19]: len(df)
Out[19]: 21739

'df' doesn't contain the entire CSV file, since you set the chunk size to 100000; 21739 is just the number of rows inserted in the last iteration.
If you do a count(1) of your table, I bet you'll find all of your rows there, not just 21739.
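For instance, a minimal check of what actually landed in SQLite (a sketch, assuming the table name 'table' and the database file from the question):

from sqlalchemy import create_engine, text

csv_database = create_engine('sqlite:///csv_database.db')
with csv_database.connect() as conn:
    # Count every row written across all chunks, not just the last df
    total = conn.execute(text('SELECT COUNT(1) FROM "table"')).scalar()
print(total)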

The following code works for me:
import numpy as np
import pandas as pd
import sqlite3
from sqlalchemy import create_engine

DIR = 'C:/Users/aslams/Desktop/checkpoint/'
FILE = 'SUBSCRIBER1.csv'
file = '{}{}'.format(DIR, FILE)

csv_database = create_engine('sqlite:///csv_database.db')
chunksize = 10000
i = 0
j = 0
for df in pd.read_csv(file, chunksize=chunksize, iterator=True):
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})
    df.index += j
    df.to_sql('data_use', csv_database, if_exists='append')
    j = df.index[-1] + 1
    print('| index: {}'.format(j))

Related

Add new columns to a dataframe in for loop

I am able to write a for loop that adds a row to a dataframe each time, as in the following example:
from random import randint
import numpy as np
import pandas as pd

dataframe = []
for i in range(2):
    value = randint(0, 10)
    for j in range(2):
        mean = np.mean(value)
        dataframe.append(mean)
cols = ['mean']
result = pd.DataFrame(dataframe, columns=cols)
result
This outputs a dataframe that looks like:
mean
8
8
9
9
How could I output a dataframe that looks like this instead?
mean_1 mean_2 mean_3 mean_4
8 8 9 9
I made the dataframe a pandas DataFrame from the beginning, and then there are multiple ways to add a column:
from random import randint
import numpy as np
import pandas as pd

df = pd.DataFrame()
counter = 0
for i in range(2):
    value = randint(0, 10)
    for j in range(2):
        counter += 1
        mean = np.mean(value)
        column_name = "mean_" + str(counter)
        df.loc[1, column_name] = mean
In answer to the comment: I also moved the line where value is set. It depends, of course, on whether you want the same number per column or a new number everywhere:
from random import randint
import numpy as np
import pandas as pd

df = pd.DataFrame()
for i in range(2):
    for j in range(2):
        value = randint(0, 10)
        mean = np.mean(value)
        column_name = "mean_" + str(i + 1)
        df.loc[j + 1, column_name] = mean
import pandas as pd
from random import randint
import numpy as np

m = 2
n = 2
dataframe = pd.DataFrame([0], columns=['mean_1'])
for i in range(m):
    value = randint(0, 10)
    for j in range(n):
        mean = np.mean(value)
        dataframe['mean_' + str(n * i + j + 1)] = mean
I tried to keep what you wrote while adding a few details: a counter for the columns, and the assembly of your final dataframe. This is not the most optimised way, but as mentioned I tried to keep your logic.
from random import randint
import numpy as np
import pandas as pd

dataframe = []
count = 1
cols = []
for i in range(2):
    value = randint(0, 10)
    for j in range(2):
        mean = np.mean(value)
        dataframe.append(mean)
        cols.append('mean_' + str(count))
        count = count + 1
df = pd.DataFrame(columns=cols)
a_series = pd.Series(dataframe, index=cols)
d = df.append(a_series, ignore_index=True)
Output: a single-row dataframe with columns mean_1 through mean_4.
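Note that DataFrame.append was removed in pandas 2.0, so the last line fails on current pandas; an equivalent final step using pd.concat (a sketch under that assumption) would be:

# pandas 2.0+: build the one-row frame from the series instead of append
d = pd.concat([df, a_series.to_frame().T], ignore_index=True)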

How to store the results of each iteration of for loop in a dataframe

import numpy as np

cols = Germandata.columns
percentage_list = [0.05, 0.01, 0.1]
for i in range(len(Germandata)):
    for percentage in percentage_list:
        columns_n = 3
        random_columns = np.random.choice(cols, columns_n, replace=False)
        local_data = Germandata.copy()
        remove_n = int(round(local_data.shape[0] * percentage, 0))
        for column_name in random_columns:
            drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
            local_data.loc[drop_indices, column_name] = np.nan
The code selects columns at random, deletes a certain percentage of observations from the data, and replaces them with NaNs. The problem is that after running the loop I only get the dataframe for the final percentage in the list, because it is overwritten on each iteration. How do I store the dataframe with NaNs after each iteration? Ideally I should get three dataframes, each with a different percentage of data deleted.
Try this
df_list = []
cols = Germandata.columns
percentage_list = [0.05, 0.01, 0.1]
for percentage in percentage_list:
    columns_n = 3
    random_columns = np.random.choice(cols, columns_n, replace=False)
    local_data = Germandata.copy()
    remove_n = int(round(local_data.shape[0] * percentage, 0))
    for column_name in random_columns:
        drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
        local_data.loc[drop_indices, column_name] = np.nan
    local_data['percentage'] = percentage  # optional
    df_list.append(local_data)

df_05 = df_list[0]
df_01 = df_list[1]
df_1 = df_list[2]
Alternatively, you can use a dictionary
df_dict = {}
cols = Germandata.columns
percentage_list = [0.05, 0.01, 0.1]
for percentage in percentage_list:
    columns_n = 3
    random_columns = np.random.choice(cols, columns_n, replace=False)
    local_data = Germandata.copy()
    remove_n = int(round(local_data.shape[0] * percentage, 0))
    for column_name in random_columns:
        drop_indices = np.random.choice(local_data.index, remove_n, replace=False)
        local_data.loc[drop_indices, column_name] = np.nan
    local_data['percentage'] = percentage  # optional
    df_dict[str(percentage)] = local_data

df_05 = df_dict['0.05']
df_01 = df_dict['0.01']
df_1 = df_dict['0.1']
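Since the NaN-injection block is repeated verbatim in both versions, it could also be factored into a helper function (a sketch, assuming Germandata is a pandas DataFrame; the name inject_nans is hypothetical):

def inject_nans(data, percentage, columns_n=3):
    # Randomly pick columns and blank out `percentage` of their rows
    random_columns = np.random.choice(data.columns, columns_n, replace=False)
    out = data.copy()
    remove_n = int(round(out.shape[0] * percentage, 0))
    for column_name in random_columns:
        drop_indices = np.random.choice(out.index, remove_n, replace=False)
        out.loc[drop_indices, column_name] = np.nan
    return out

df_dict = {str(p): inject_nans(Germandata, p) for p in [0.05, 0.01, 0.1]}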

Pandas concatenate dataframes with for loop

I am trying to get tables from a website. The website's URL contains dates so I will have to iterate over dates in order to get historical data. I am generating dates as follows:
import datetime

start = datetime.datetime.strptime("26-09-2016", "%d-%m-%Y")
end = datetime.datetime.strptime("30-09-2016", "%d-%m-%Y")
date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]
dates_list = []
for date in date_generated:
    txt = str(str(date.day) + '.' + str(date.month) + '.' + str(date.year))
    dates_list.append(txt)
dates_list
After this, I am running the code below to concatenate all the tables:
for i in range(0, 3):
    allURL = 'https://www.uzse.uz/trade_results?date=' + dates_list[i] + '&locale=en&mkt_id=ALL&page=%d'
    ndf_list = []
    for i in range(1, 100):
        url = allURL % i
        if pd.read_html(url)[0].empty:
            break
        else:
            ndf_list.append(pd.read_html(url)[0])
    ndf = pd.concat(ndf_list)
    ndf.insert(0, 'Date', dates_list[i])
mdf = pd.concat(ndf, ignore_index=True)
mdf
However, this does not work and I get:
TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"
I do not understand what I am doing wrong. I am expecting to have one table that comes from 26th, 27th, and 28th September.
Please help.
Not sure about the last line(s), but I'd approach it this way
import datetime
import pandas as pd

start = datetime.datetime.strptime("26-09-2016", "%d-%m-%Y")
end = datetime.datetime.strptime("30-09-2016", "%d-%m-%Y")
date_generated = [
    start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]
dates_list = []
for date in date_generated:
    txt = str(str(date.day) + '.' + str(date.month) + '.' + str(date.year))
    dates_list.append(txt)
dates_list

ndf = pd.DataFrame()  # create empty ndf
for i in range(0, 3):
    allURL = 'https://www.uzse.uz/trade_results?date=' + \
        dates_list[i] + '&locale=en&mkt_id=ALL&page=%d'
    # ndf_list = []
    for j in range(1, 100):
        url = allURL % j
        if pd.read_html(url)[0].empty:
            break
        else:
            # ndf_list.append(pd.read_html(url)[0])
            chunk = pd.read_html(url)[0]
            chunk['Date'] = dates_list[i]  # Date lands in the last position; fixed below
            # get a list of all the columns
            cols = chunk.columns.tolist()
            # rearrange the columns, move the last element (Date) to the first position
            cols = cols[-1:] + cols[:-1]
            # reorder the dataframe
            chunk = chunk[cols]
            ndf = pd.concat([ndf, chunk])
# ndf = pd.concat(ndf_list)
# ndf.insert(0, 'Date', dates_list[i])
print(ndf)
# mdf = pd.concat(ndf, ignore_index=True)
# mdf
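Repeatedly concatenating inside the loop copies ndf on every iteration; a variant that collects the chunks in a list and concatenates once at the end (a sketch, untested against the live site) would be:

all_chunks = []
for i in range(0, 3):
    allURL = 'https://www.uzse.uz/trade_results?date=' + \
        dates_list[i] + '&locale=en&mkt_id=ALL&page=%d'
    for j in range(1, 100):
        chunk = pd.read_html(allURL % j)[0]  # fetch each page once, not twice
        if chunk.empty:
            break
        chunk.insert(0, 'Date', dates_list[i])  # put Date in the first column directly
        all_chunks.append(chunk)
mdf = pd.concat(all_chunks, ignore_index=True)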

writing function in pandas/python

I have just started to learn Python and don't have much of a dev background. Here is the code I have written while learning.
I now want to make a function which does exactly what my for loop does, but calculates a different exp column (exp, exp1, etc.) based on a different num column (num, num1, etc.).
How can I do this?
import pandas as pd

index = [0, 1]
s = pd.Series(['a', 'b'], index=index)
t = pd.Series([1, 2], index=index)
t1 = pd.Series([3, 4], index=index)
df = pd.DataFrame(s, columns=["str"])
df["num"] = t
df['num1'] = t1
print(df)

exp = []
for index, row in df.iterrows():
    if row['str'] == 'a':
        row['mul'] = -1 * row['num']
        exp.append(row['mul'])
    else:
        row['mul'] = 1 * row['num']
        exp.append(row['mul'])
df['exp'] = exp
print(df)
This is what I was trying to do, but it gives wrong results:
import pandas as pd

index = [0, 1]
s = pd.Series(['a', 'b'], index=index)
t = pd.Series([1, 2], index=index)
t1 = pd.Series([3, 4], index=index)
df = pd.DataFrame(s, columns=["str"])
df["num"] = t
df['num1'] = t1

def f(x):
    exp = []
    for index, row in df.iterrows():
        if row['str'] == 'a':
            row['mul'] = -1 * x
            exp.append(row['mul'])
        else:
            row['mul'] = 1 * x
            exp.append(row['mul'])
    return exp

df['exp'] = df['num'].apply(f)
df['exp1'] = df['num1'].apply(f)
df
Per suggestion below, I would do:
df['exp'] = np.where(df.str == 'a', df['num'] * -1, df['num'] * 1)
df['exp1'] = np.where(df.str == 'a', df['num1'] * -1, df['num1'] * 1)
I think you are looking for np.where:
df['exp'] = np.where(df.str == 'a', df['num'] * -1, df['num'] * 1)
df

Out[281]:
  str  num  num1  exp
0   a    1     3   -1
1   b    2     4    2
Normal dataframe operation (note the sign: rows with 'a' get the negated value):
df["exp"] = df.apply(lambda x: x["num"] * (-1 if x["str"] == "a" else 1), axis=1)
Mathematical dataframe operation:
df["exp"] = ((df["str"] != 'a') - 0.5) * 2 * df["num"]

Take n rows from a spark dataframe and pass to toPandas()

I have this code:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).toPandas()
Works fine, does what it needs to. Suppose though I only want to display the first n rows, and then call toPandas() to return a pandas dataframe. How do I do it? I can't call take(n) because that doesn't return a dataframe and thus I can't pass it to toPandas().
So to put it another way, how can I take the top n rows from a dataframe and call toPandas() on the resulting dataframe? Can't think this is difficult but I can't figure it out.
I'm using Spark 1.6.0.
You can use the limit(n) function:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.limit(2).withColumn('age2', df.age + 2).toPandas()
Or:
l = [('Alice', 1),('Jim',2),('Sandra',3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df.withColumn('age2', df.age + 2).limit(2).toPandas()
You could get the first rows of the Spark DataFrame with head and then create a pandas DataFrame:
import pandas as pd

l = [('Alice', 1), ('Jim', 2), ('Sandra', 3)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
df_pandas = pd.DataFrame(df.head(3), columns=df.columns)

In [4]: df_pandas
Out[4]:
     name  age
0   Alice    1
1     Jim    2
2  Sandra    3
Try this:
def showDf(df, count=None, percent=None, maxColumns=0):
    if df is None:
        return
    import pandas
    from IPython.display import display
    pandas.set_option('display.encoding', 'UTF-8')
    # Pandas dataframe
    dfp = None
    # maxColumns param
    if maxColumns >= 0:
        if maxColumns == 0:
            maxColumns = len(df.columns)
        pandas.set_option('display.max_columns', maxColumns)
    # count param
    if count is None and percent is None:
        count = 10  # default count
    if count is not None:
        count = int(count)
        if count == 0:
            count = df.count()
        pandas.set_option('display.max_rows', count)
        dfp = pandas.DataFrame(df.head(count), columns=df.columns)
        display(dfp)
    # percent param
    elif percent is not None:
        percent = float(percent)
        if 0.0 <= percent <= 1.0:
            import datetime
            now = datetime.datetime.now()
            # long() is Python 2 only; int() works in both versions
            seed = int(now.strftime("%H%M%S"))
            dfs = df.sample(False, percent, seed)
            count = df.count()
            pandas.set_option('display.max_rows', count)
            dfp = dfs.toPandas()
            display(dfp)
Examples of usages are:
# Shows the first ten rows of the Spark dataframe
showDf(df)
showDf(df, 10)
showDf(df, count=10)
# Shows a random sample which represents 15% of the Spark dataframe
showDf(df, percent=0.15)
