Import CSV to database using sqlalchemy - python

I am using this example to upload a csv file into a sqlite database:
this is my code:
from numpy import genfromtxt
from time import time
from datetime import datetime
from sqlalchemy import Column, Integer, Float, Date, String, VARCHAR
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
def Load_Data(file_name):
    """Read a comma-separated file and return its rows as a list of lists.

    ``genfromtxt`` is called with its default ``dtype`` (float), so any field
    that cannot be parsed as a number -- a header row or a text column, for
    example -- comes back as ``nan`` rather than as a string.
    """
    data = genfromtxt(file_name, delimiter=',')  # skiprows=1, converters={0: lambda s: str(s)})
    return data.tolist()
Base = declarative_base()
class cdb1(Base):
    """Declarative mapping for the ``cdb1`` table."""

    # Tell SQLAlchemy what the table name is and if there's any
    # table-specific arguments it should know about
    __tablename__ = 'cdb1'
    __table_args__ = {'sqlite_autoincrement': True}

    # Tell SQLAlchemy the name of each column and its attributes:
    id = Column(Integer, primary_key=True, nullable=False)
    name = Column(VARCHAR(40))
    shack = Column(VARCHAR)
    db = Column(Integer)
    payments = Column(Integer)
    status = Column(VARCHAR)
if __name__ == "__main__":
    t = time()
    print('creating database')

    # Create the database
    engine = create_engine('sqlite:///cdb.db')
    Base.metadata.create_all(engine)

    # Create the session
    session = sessionmaker()
    session.configure(bind=engine)
    s = session()

    try:
        file_name = 'client_db.csv'
        data = Load_Data(file_name)

        for i in data:
            record = cdb1(
                name=i[0],
                shack=i[1],
                db=i[2],
                payments=i[3],
                status=i[4],
            )
            s.add(record)  # Add all the records

        s.commit()  # Attempt to commit all the records
    except Exception as exc:
        s.rollback()  # Rollback the changes on error
        # Report the underlying cause instead of hiding it behind a generic message.
        print('error in reading: %s' % exc)
    finally:
        s.close()  # Close the connection
    print("Time elapsed: " + str(time() - t) + " s.")  # 0.091s
and this is the first few rows of the csv file:
Name,Shack,DB,Payments,Status
Loyiso Dwala,I156,13542,37,LightsOnly ON
Attwell Fayo,I157,13077,32,LightsON
David Mbhele,G25,13155,33,LightsON
The DB is created ok, but only some of the data is captured into the attributes: the 'payments' and 'db' column are populated correctly, but everything else comes out as NULL.
UPDATED CORRECT CODE (using pandas dataframe):
from numpy import genfromtxt
from time import time
from datetime import datetime
from sqlalchemy import Column, Integer, Float, Date, String, VARCHAR
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import csv
import pandas as pd
#def Load_Data(file_name):
#data = csv.reader(file_name, delimiter=',')# skiprows=1, converters={0: lambda s: str(s)})
#return data.tolist()
Base = declarative_base()
class cdb1(Base):
    """Declarative mapping for the ``cdb1`` table; attribute names match the CSV header."""

    # Tell SQLAlchemy what the table name is and if there's any
    # table-specific arguments it should know about
    __tablename__ = 'cdb1'
    __table_args__ = {'sqlite_autoincrement': True}

    # Tell SQLAlchemy the name of each column and its attributes:
    id = Column(Integer, primary_key=True, nullable=False)
    Name = Column(VARCHAR(40))
    Shack = Column(VARCHAR)
    DB = Column(Integer)
    Payments = Column(Integer)
    Status = Column(VARCHAR)
# Open (or create) the SQLite file and create any declared tables that are missing.
engine = create_engine('sqlite:///cdb.db')
Base.metadata.create_all(engine)
file_name = 'client_db.csv'
# Load the whole CSV into a DataFrame; the header row supplies the column names.
df = pd.read_csv(file_name)
# if_exists='replace' drops an existing table of this name and recreates it from
# the DataFrame; the DataFrame index is written out as the 'id' column.
df.to_sql(con=engine, index_label='id', name=cdb1.__tablename__, if_exists='replace')

Are you familiar with Pandas Dataframe?
Really simple to use (and debug)
pandas.read_csv(file_name)
In [5]: pandas.read_csv('/tmp/csvt.csv')
Out[5]:
Name Shack DB Payments Status
0 Loyiso Dwala I156 13542 37 LightsOnly ON
1 Attwell Fayo I157 13077 32 LightsON
2 David Mbhele G25 13155 33 LightsON
For inserting the DataFrames data into a table, you can simply use pandas.DataFrame.to_sql
So your main code will end up looking something like this:
# Open (or create) the SQLite file and create any declared tables that are missing.
engine = create_engine('sqlite:///cdb.db')
Base.metadata.create_all(engine)
file_name = 'client_db.csv'
# Read the CSV into a DataFrame, then write the DataFrame into the table.
df = pandas.read_csv(file_name)
# if_exists='replace' drops and recreates the table; the index becomes the 'id' column.
df.to_sql(con=engine, index_label='id', name=cdb1.__tablename__, if_exists='replace')
You should read the documentation linked above and set the function parameters to suit your purpose (look especially at if_exists, index, index_label and dtype).

Related

Search and Update datetime column with given time in mssql+pyodbc and sqlalchemy

I have been using MSSQL and pyodbc and updating a column with DateTime type by following the below thread:
How to update datetime field in MSSQL using python pyodbc module
Now I am also trying to incorporate sqlalchemy in my application stack. I am aware of following answer but it does not serve my purpose.
Datetime not updating on insert using SQLAlchemy on MSSQL
To elaborate on the problem:
from sqlalchemy import Column, String, DateTime, Date
from sqlalchemy.ext.declarative import declarative_base
import sqlalchemy
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from datetime import datetime as datetimemodule, date as datemodule
import traceback
BASE = declarative_base()
class REPR:
    """Mixin that renders an object as ``*****ClassName(key=value, ...)*****``."""

    def make_repr(self, **kwrgs):
        """Return the display string built from the given keyword arguments.

        Each keyword is rendered as ``name=value`` in the order supplied and
        the list is wrapped in the name of the instance's class.
        """
        argument_string = ', '.join(f'{a}={b}' for a, b in kwrgs.items())
        display_string = f'{self.__class__.__name__}({argument_string})'
        return '*****' + display_string + '*****'

    def __str__(self):
        """Use the same text for ``str()`` as for ``repr()``."""
        return self.__repr__()
class Recog_EntryLog(BASE, REPR):
    """Declarative mapping for the ``test`` table."""

    # In database corresponding table has no primary key.
    # As suggested in the documentation, largest candidate key is marked as
    # primary key here.
    __tablename__ = 'test'

    id = Column(String(100), primary_key=True)
    # I want to update this field by custom defined time,
    # not by the entry update time.
    entrytime = Column(DateTime, primary_key=True)
    entrydate = Column(Date, primary_key=True)

    def __repr__(self):
        """Render the three key columns through ``REPR.make_repr``."""
        return self.make_repr(id=self.id, entrytime=self.entrytime, entrydate=self.entrydate)
def get_connection(dbname):
    """Create an engine and an ORM session for the given MSSQL database.

    Returns a ``(session, engine)`` tuple.
    """
    # SQLAlchemy URLs have the form dialect+driver://user:password@host:port/database;
    # the credentials must be separated from the host by '@'.
    connection_info = f'mssql+pyodbc://remote:prashant@127.0.0.1:1433/{dbname}?driver=ODBC+Driver+17+for+SQL+Server'
    engine = create_engine(connection_info, echo=True)
    session = sessionmaker(bind=engine)()
    return session, engine
def close_connection(connection_info: sqlalchemy.orm.session.Session,
                     engine: sqlalchemy.engine.base.Engine):
    """Close the session, then dispose of the engine."""
    connection_info.close()
    engine.dispose()
def search_and_update(connected_session: sqlalchemy.orm.session.Session,
                      id,
                      _time: datetimemodule,
                      _date: datemodule):
    """Insert an entry for ``id`` on ``_date``, or move its latest entry to ``_time``.

    The most recent row (by ``entrytime``) for the given id and date is looked
    up. If there is none, a new row is added; otherwise that row's
    ``entrytime`` is updated to ``_time``. Errors during the write are printed
    and the transaction is rolled back.
    """
    res = (connected_session.query(Recog_EntryLog)
           .filter_by(id=id, entrydate=_date)
           .order_by(Recog_EntryLog.entrytime.desc())
           .first())
    print(res)
    try:
        if res is None:
            connected_session.add(Recog_EntryLog(id=id, entrytime=_time, entrydate=_date))
        else:
            # Target exactly the row found above via its full composite key.
            query = connected_session.query(Recog_EntryLog).filter_by(
                id=id, entrytime=res.entrytime, entrydate=_date)
            query.update({Recog_EntryLog.entrytime: _time})
        connected_session.commit()
    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still propagate.
        traceback.print_exc()
        connected_session.rollback()
if __name__ == '__main__':
    connected_session, connected_engine = get_connection('authentication')
    try:
        # Will create new entry on first run/Will update old entry
        search_and_update(connected_session, '1234', datetimemodule.now(), datemodule.today())
        # should update datetime
        search_and_update(connected_session, '1234', datetimemodule.now(), datemodule.today())
        # should update datetime
        search_and_update(connected_session, '1234', datetimemodule.now(), datemodule.today())
    finally:
        # Release the session and engine even if one of the calls raises.
        close_connection(connected_session, connected_engine)
I was expecting the last two (red) values to be updated with the first two (blue) values. The generated SQL does not seem to match the SQL from the first linked question.
Can someone help me write such a function properly using sqlalchemy?

Speed-up Sqlalchemy query with subqueries to check existence of related rows

I have a query in SQLAlchemy that is structurally the same as the MWE below, which I am using to query a SQLite DB.
#!/usr/bin/env python3
from timeit import timeit
import os.path
from tqdm import tqdm
from pprint import pprint
from datetime import date, timedelta
from random import choice, randint
from sqlalchemy import (
Column, Integer, Date, String, ForeignKey, sql, orm, create_engine)
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base
SAVE_INTERVAL = 365
START_DATE = date(2015, 1, 1)
DATE_RANGE = SAVE_INTERVAL * 5
NUM_AS = 10000
NAME_RANGE = NUM_AS // 4
NUM_BS = NUM_AS // 4
Base = declarative_base()
class A(Base):
    """Row of table ``a``: a named record with the date it was saved."""

    __tablename__ = 'a'

    id = Column(Integer, primary_key=True)
    name = Column(String)
    save_date = Column(Date)

    def __init__(self, name, save_date):
        self.name = name
        self.save_date = save_date

    # Relationships
    bs = relationship('B', back_populates='a')

    def __repr__(self):
        return 'A({}, {})'.format(self.name,
                                  self.save_date.strftime('%Y/%m/%d'))
class B(Base):
    """Row of table ``b``: references one ``A`` row through ``a_id``."""

    __tablename__ = 'b'

    id = Column(Integer, primary_key=True)
    a_id = Column(Integer, ForeignKey('a.id'))

    # Relationships
    a = relationship('A', back_populates='bs')

    def __init__(self, a):
        self.a = a
db_path = os.path.expanduser('~/test-query{}.db'.format(NUM_AS))
engine = create_engine('sqlite:///' + db_path)
Session = sessionmaker(bind=engine)
session = Session()

# Populate the database only on the first run, when the file does not exist yet.
if not os.path.exists(db_path):
    Base.metadata.create_all(engine)
    print("Generating test data")
    # Create dummy As and Bs data
    a_list = []
    for _ in tqdm(range(NUM_AS), "Generating As"):
        a = A('name{:03}'.format(randint(1, NAME_RANGE)),
              START_DATE + timedelta(days=randint(0, DATE_RANGE)))
        a_list.append(a)
        session.add(a)
    session.commit()
    for _ in tqdm(range(NUM_BS), "Generating Bs"):
        session.add(B(choice(a_list)))
    session.commit()
# Create query for sessions that still need to be reported
A_alias = orm.aliased(A)

# True when another row with the same name has a later save_date.
has_newer_version = (
    session.query(A_alias.id)
    .filter(A_alias.name == A.name,
            A_alias.save_date > A.save_date)
    .exists())

# Absolute distance in days between the outer row and the aliased row.
days_apart = sql.func.abs(
    sql.func.julianday(A.save_date) -
    sql.func.julianday(A_alias.save_date))

# True when a same-named row that is referenced in table B (i.e. has a
# report) was saved within SAVE_INTERVAL days of the outer row.
has_nearby_report = (
    session.query(A_alias.id)
    .join(B)  # Only select sessions with a report
    .filter(A_alias.name == A.name,
            days_apart <= SAVE_INTERVAL)
    .exists())

query = session.query(A).filter(~has_newer_version, ~has_nearby_report)
print(query)
def count():
    """Run the query and return the number of matching rows."""
    # The result must be returned: the caller formats count() into the
    # "Number of returned records" message, which otherwise shows None.
    return query.count()
print("Calculating exec time...")
print('{} s'.format(timeit(count, number=1)))
pprint("Number of returned records {}".format(count()))
It works fine, but is a bit slow on the production database (which is around 8,000 rows) making the web-page sluggish.
So I am wondering whether there is a way I can speed it up, either by altering the SQL or by changing how the ORM objects are generated?

Creating schema via declarative mapping: Base.metadata.create_all(engine) does not work

Here is an absurd problem with sqlalchemy that seems easy! First, this is my config file for connecting to mysql database:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
# URL form is dialect://user:password@host:port/database; the credentials are
# separated from the host by '@' (here the root password is empty).
engine = create_engine('mysql://root:@localhost:3306/digi')
and then, I am trying to create a table called 'sale-history' :
from config import *
from sqlalchemy import *
class Sale(Base):
    """Declarative mapping intended for the ``sale-history`` table."""

    __tablename__ = 'sale-history'

    # NOTE: the lowercase ``column`` calls are kept as posted -- they are the
    # subject of this question (declarative mapping needs ``Column``).
    order_id = column(Integer, primary_key= True)
    customer_id = column(Integer)
    item_id = column(Integer)  # foreign key with product list
    cartFinalize_dateTime = column(DATETIME)
    amount_ordrered = column(Integer)
    city_name = column(String(191))
    quantity_ordered = column(Integer)

    def __repr__(self):
        return "<Sale(city_name='%s')>" % (self.city_name)
Sale.__table__
Base.metadata.create_all(engine)
Now, what I wonder is that
Sale.__table__
and
Base.metadata.create_all(engine)
are not known to my code. More accurate, these are not in suggestion options showed by pycharm editor. Debugging the code does not throw any error(returns 0). What should I do to create tables?
I appreciate your consideration so much!
The code is using column to define columns in the table but it should be using Column (note the upper-case "C").
A few tips/comments
PyCharm may provide better support if you avoid the from module import * idiom. You can alias module names if they are too long to type, for example import sqlalchemy as sa
You can see the SQL generated by the engine by passing echo=True to create_engine
Tablenames with hyphens need to be quoted with backticks to be valid. Sqlalchemy does this automatically, but other applications may not. Using underscores instead may be more convenient.
The final code might look like this:
config
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
# URL form is dialect://user:password@host:port/database; echo=True logs the
# SQL that the engine emits.
engine = create_engine('mysql://root:@localhost:3306/test', echo=True)
Model
import sqlalchemy as sa
import config


# Base lives in the config module; with ``import config`` it has to be
# referenced through the module name.
class Sale(config.Base):
    """Declarative mapping for the ``sale-history`` table."""

    __tablename__ = 'sale-history'

    order_id = sa.Column(sa.Integer, primary_key=True)
    customer_id = sa.Column(sa.Integer)
    item_id = sa.Column(sa.Integer)  # foreign key with product list
    cartFinalize_dateTime = sa.Column(sa.DATETIME)
    amount_ordrered = sa.Column(sa.Integer)
    city_name = sa.Column(sa.String(191))
    quantity_ordered = sa.Column(sa.Integer)

    def __repr__(self):
        return "<Sale(city_name='%s')>" % (self.city_name)


# Emit CREATE TABLE for every table registered on the shared Base.
config.Base.metadata.create_all(config.engine)

SQLAlchemy parse a large file

How to parse a big file (50-100GB) into my database using SQLAlchemy? Let's say I have two tables.
import collections
import re
import Bio.SeqIO
import sqlalchemy
from sqlalchemy import ForeignKey, UniqueConstraint
from sqlalchemy import Column, Float, Integer, String, Text, DateTime
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql.expression import ClauseElement
Base = declarative_base()
class Protein_sequence(Base):
    """Row of ``protein_sequence``: one unique amino-acid sequence."""

    __tablename__ = 'protein_sequence'

    prot_seq_id = Column(Integer, primary_key=True)
    prot_seq = Column(Text, unique=True)

    protein_annotation = relationship('Protein', back_populates='protein_sequence')
class Protein(Base):
    """Row of ``protein_annotation``: accession and name linked to a sequence row."""

    __tablename__ = 'protein_annotation'

    prot_id = Column(Integer, primary_key=True)
    prot_seq_id = Column(Integer, ForeignKey('protein_sequence.prot_seq_id'))
    prot_acc = Column(Text, unique=True)
    prot_name = Column(Text)

    protein_sequence = relationship('Protein_sequence', back_populates='protein_annotation')
def parse_fasta(path, prot_db='unknown', taxon_name=None, taxon_id=None):
    """Parse a fasta file (UniProt or NCBInr) and yield one dict per record.

    Each yielded ``OrderedDict`` holds ``seq``, optionally ``prot_gi`` and
    ``prot_db`` (only when the header contains them), then ``prot_acc`` and
    ``prot_name``. A fresh dict is created for every record so that values
    from one header never leak into the next.

    ``prot_db``, ``taxon_name`` and ``taxon_id`` are accepted for interface
    compatibility; they are not used when building the records.

    Raises:
        IndexError: if a header has no ``db|accession`` part or no name
            after the first space.
    """
    # Compile once, outside the per-record loop; raw strings keep the
    # backslash escapes intact.
    gi_prefix = re.compile(r'^gi\|([0-9]+)(?:\||\s|$)')
    db_prefix = re.compile(r'^([^|]+)\|')
    acc_field = re.compile(r'^[^|]+\|([^ ]+)')
    name_field = re.compile(r'^[^ ]+ (.+)')

    for record in Bio.SeqIO.parse(path, 'fasta'):
        prot = collections.OrderedDict()
        prot['seq'] = str(record.seq)
        desc = record.description

        gi_match = gi_prefix.match(desc)
        if gi_match:
            prot['prot_gi'] = gi_match.group(1)
            # Drop the leading "gi|<number>|" so the database tag comes first.
            desc = desc[gi_match.end():]

        db_match = db_prefix.match(desc)
        if db_match:
            prot['prot_db'] = db_match.group(1)

        prot['prot_acc'] = acc_field.findall(desc)[0]
        prot['prot_name'] = name_field.findall(desc)[0]
        yield prot
def prot_db_from_fasta():
    """Create tables in SQLite database. Input fasta file.

    Every record of ``prot.fasta`` is stored as a ``Protein_sequence`` row
    (reusing the existing row when the sequence is already present) plus a
    ``Protein`` annotation row that points at it.
    """
    db = 'sqlite:///proteomic.db'
    engine = sqlalchemy.create_engine(db)
    Base.metadata.create_all(engine)
    Session = sqlalchemy.orm.sessionmaker(bind=engine)
    session = Session()
    conn = engine.connect()
    p = 'prot.fasta'
    # choose only columns in table protein_annotation (same for every record)
    cols = [c.name for c in Protein.__table__.columns]
    try:
        for prot in parse_fasta(p):
            # prot is dictionary storing info about protein
            protein_sequence = Protein_sequence(prot_seq=prot['seq'])
            session.add(protein_sequence)
            try:
                session.commit()
            except sqlalchemy.exc.IntegrityError:
                # The sequence is already stored (unique constraint). After the
                # rollback the new object has no id, so fetch the existing row
                # to link the annotation to it.
                session.rollback()
                protein_sequence = (session.query(Protein_sequence)
                                    .filter_by(prot_seq=prot['seq'])
                                    .one())
            annotation = {key: prot[key] for key in cols if key in prot}
            annotation['prot_seq_id'] = protein_sequence.prot_seq_id
            protein_annotation = Protein(**annotation)
            session.add(protein_annotation)
            session.commit()
    finally:
        # Release the session and the connection even if parsing or a commit fails.
        session.close()
        conn.close()
# run function to insert data into database
prot_db_from_fasta()
The problem is that I need information about sequence ID (for annotation table) and in the same time insert sequence into database unless it is already there. Using SQLAlchemyCore will not help, the problem is I am using session commit in every for cycle. And it is very slow. If I use this script for 70MB big file, it takes 17 sec. If I use sqlite3 instead of SQLAlchemy it takes only 0.3 sec.
I know it is better to insert data in one big transaction but how to do it. I will not get back sequence ID to use them for my protein annotation.
There is also example of fasta file.
>gi|115646|sp|P02662.2|CASA1_BOVIN Alpha-S1-casein
MKLLILTCLVAVALARPKHPIKHQGLPQEVLNENLLRFFVAPFPEVFGKEKVNELSKDIGSESTEDQAME
DIKQMEAESISSSEEIVPNSVEQKHIQKEDVPSERYLGYLEQLLRLKKYKVPQLEIVPNSAEERLHSMKE
GIHAQQKEPMIGVNQELAYFYPELFRQFYQLDAYPSGAWYYVPLGTQYTDAPSFSDIPNPIGSENSEKTT
MPLW
>gi|115654|sp|P02663.2|CASA2_BOVIN Alpha-S2-casein
MKFFIFTCLLAVALAKNTMEHVSSSEESIISQETYKQEKNMAINPSKENLCSTFCKEVVRNANEEEYSIG
SSSEESAEVATEEVKITVDDKHYQKALNEINQFYQKFPQYLQYLYQGPIVLNPWDQVKRNAVPITPTLNR
EQLSTSEENSKKTVDMESTEVFTKKTKLTEEEKNRLNFLKKISQRYQKFALPQYLKTVYQHQKAMKPWIQ
PKTKVIPYVRYL
So what is good practice to insert data into database.
Picture of my proteomic database.

SQLAlchemy - How to select certain rows?

How to simulate a SELECT in SQLAlchemy? I would like to create a function which takes a couple of parameters and returns a row which contains those values but I can't do SELECT.
The only way I found is below but I can't find metadata in SQLAlchemy module.
EDIT: I figured out that BoundMetaData is deprecated so MetaData is appropriate, but it says that Select has no len
# -*- coding: utf-8 -*-
import sqlalchemy
from sqlalchemy import Column, Table
from sqlalchemy import UniqueConstraint
from sqlalchemy import create_engine
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine('sqlite:///db.db', echo=False)
Base = declarative_base()
s = sqlalchemy.orm.Session(engine)
class Flight(Base):
    """Declarative mapping for the ``flights`` table."""

    __tablename__ = 'flights'

    id = Column(sqlalchemy.Integer, primary_key=True)
    destination_from = Column(sqlalchemy.String)
    destination_to = Column(sqlalchemy.String)
    creation_date = Column(sqlalchemy.Date)
    start_date = Column(sqlalchemy.Date)
    return_date = Column(sqlalchemy.Date)
    price = Column(sqlalchemy.Float)
    filename = Column(sqlalchemy.String)
    bought_days_before = Column(sqlalchemy.Integer)

    # No two rows may share the same combination of these six columns.
    __table_args__ = (
        UniqueConstraint('creation_date', 'destination_from', 'destination_to',
                         'start_date', 'return_date', 'price'),
    )
Base.metadata.create_all(engine)
def insert_into_flights(**kwargs):
    """Insert one ``Flight`` built from ``kwargs``; skip it if it violates a constraint."""
    s.add(Flight(**kwargs))
    try:
        s.commit()
    except IntegrityError:
        # The row violates an integrity constraint (e.g. it is a duplicate
        # under the unique constraint): undo it and carry on.
        s.rollback()
def get_prices(date_from, days, bought_days_before, destination, min=True, avg=False):
    """Attempt to load the ``flights`` table and print the size of a SELECT on it.

    NOTE: kept as posted apart from layout and the print call -- the undefined
    ``metadata`` name and calling ``len()`` on a Select object are the
    problems this question asks about. None of the parameters are used yet.
    """
    flights = Table('flights', metadata, autoload=True)
    print(len(flights.select()))
s.query(Flight).filter(Flight.id==34).all()
This is an example selecting the Flight with id 34.
See SQLAlchemy docs

Categories

Resources