I am trying to get a list of files in a user specified directory to be saved to a database. What I have at the moment is:
import os
import sqlite3

def get_list():
    folder = input("Directory to scan : ")
    results = []
    for path in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, path)):
            results.append(path)
    print(results)
    return results

def populate(results):
    connection = sqlite3.connect("videos.db")
    with connection:
        connection.execute("CREATE TABLE IF NOT EXISTS files (id INTEGER PRIMARY KEY, file_name TEXT);")
    for filename in results:
        insert_string = "INSERT INTO files (file_name) VALUES ('" + filename + "');"
        connection.execute(insert_string)

filelist = get_list()
populate(filelist)
It runs without a problem and prints out the list of file names, which is great, but the INSERT SQL statements seem to have no effect on the database table. I have tried to debug it: the statement stored in the variable looks good, and when I execute it manually in the console it inserts a row into the table, but when the script runs, nothing changes. Am I missing something really simple here?
Python's SQLite3 module doesn't auto-commit by default, so you need to call connection.commit() after you've finished executing queries. This is covered in the tutorial.
In addition, use ? placeholders to avoid SQL injection issues:
cur.execute('INSERT INTO files (file_name) VALUES (?)', (filename,))
Once you do that, you can insert all of your filenames at once using executemany:
cur.executemany(
    'INSERT INTO files (file_name) VALUES (?)',
    [(filename,) for filename in results],
)
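Putting both points together, a minimal revision of the question's populate() function might look like this (same videos.db file and files table as in the question):

import sqlite3

def populate(results):
    connection = sqlite3.connect("videos.db")
    cur = connection.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS files (id INTEGER PRIMARY KEY, file_name TEXT);")
    cur.executemany(
        "INSERT INTO files (file_name) VALUES (?)",
        [(filename,) for filename in results],
    )
    connection.commit()  # nothing is persisted until the commit
    connection.close()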
I'm using Python. I have a daily CSV file that I need to copy into a PostgreSQL table each day. Some of those .csv records may repeat day over day, so I want to ignore the duplicates based on a primary key field. With cursor.copy_from, day 1 is fine and the new table is created. On day 2, copy_from throws a duplicate key error (as it should), but copy_from stops on the first error. Is there a copy_from parameter that would ignore the duplicates and continue? If not, any other recommendations besides copy_from?
f = open(csv_file_name, 'r')
c.copy_from(f, 'mytable', sep=',')
This is how I'm doing it with psycopg3.
Assumes the file is in the same folder as the script and that it has a header row.
from pathlib import Path
from psycopg import sql
file = Path(__file__).parent / "the_data.csv"
target_table = "mytable"
conn = <your connection>
with conn.cursor() as cur:
    # Create an empty table with the same columns as target_table.
    cur.execute(f"CREATE TEMP TABLE tmp_table (LIKE {target_table})")
    # The csv file imports as text.
    # This approach tells postgres how to convert text to the proper column types.
    column_types = sql.Identifier(target_table)
    query = sql.SQL("COPY tmp_table FROM STDIN WITH(FORMAT csv, HEADER true)")
    typed_query = query.format(column_types)
    with cur.copy(typed_query) as copy:
        with file.open() as csv_data:
            copy.write(csv_data.read())
    cur.execute(
        f"INSERT INTO {target_table} SELECT * FROM tmp_table ON CONFLICT DO NOTHING"
    )
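If you are staying on psycopg2, as in the question, the same staging-table idea can be sketched with copy_expert; the table names and connection details here are placeholders:

import psycopg2

conn = psycopg2.connect("dbname=mydb")  # placeholder connection details
with conn, conn.cursor() as cur:
    # Stage the file into a temp table shaped like the target table.
    cur.execute("CREATE TEMP TABLE tmp_table (LIKE mytable)")
    with open(csv_file_name) as f:
        # Add HEADER true inside WITH (...) if the file has a header row.
        cur.copy_expert("COPY tmp_table FROM STDIN WITH (FORMAT csv)", f)
    # Move the rows across, skipping anything that hits the primary key.
    cur.execute("INSERT INTO mytable SELECT * FROM tmp_table ON CONFLICT DO NOTHING")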
I have a table in my PostgreSQL database in which a column type is set to bytea in order to store zipped files.
The storing procedure works fine. I have problems when I need to retrieve the zipped file I uploaded.
def getAnsibleByLibrary(projectId):
    con = psycopg2.connect(
        database="xyz",
        user="user",
        password="pwd",
        host="localhost",
        port="5432",
    )
    print("Database opened successfully")
    cur = con.cursor()
    query = "SELECT ansiblezip FROM library WHERE library.id = (SELECT libraryid from project WHERE project.id = '"
    query += str(projectId)
    query += "')"
    cur.execute(query)
    rows = cur.fetchall()
    repository = rows[0][0]
    con.commit()
    con.close()
    print(repository, type(repository))
    with open("zippedOne.zip", "wb") as fin:
        fin.write(repository)
This code creates a zippedOne.zip file, but it seems to be an invalid archive.
I also tried saving repository.tobytes(), but it gives the same result.
I don't understand how to handle memoryview objects.
If I try:
print(repository, type(repository))
the result is:
<memory at 0x7f6b62879348> <class 'memoryview'>
If I try to unzip the file:
chain#wraware:~$ unzip zippedOne.zip
The result is:
Archive: zippedOne.zip
End-of-central-directory signature not found. Either this file is not
a zipfile, or it constitutes one disk of a multi-part archive. In the
latter case the central directory and zipfile comment will be found on
the last disk(s) of this archive.
unzip: cannot find zipfile directory in one of zippedOne.zip or
zippedOne.zip.zip, and cannot find zippedOne.zip.ZIP, period.
Trying to extract it in windows gives me the error: "The compressed (zipped) folder is invalid"
This code, based on the example in the question, works for me:
import io
import zipfile

import psycopg2

DROP = """DROP TABLE IF EXISTS so69434887"""
CREATE = """\
CREATE TABLE so69434887 (
    id serial primary key,
    ansiblezip bytea
)
"""

buf = io.BytesIO()
with zipfile.ZipFile(buf, mode='w') as zf:
    zf.writestr('so69434887.txt', 'abc')

with psycopg2.connect(database="test") as conn:
    cur = conn.cursor()
    cur.execute(DROP)
    cur.execute(CREATE)
    conn.commit()
    cur.execute("""INSERT INTO so69434887 (ansiblezip) VALUES (%s)""", (buf.getvalue(),))
    conn.commit()
    cur.execute("""SELECT ansiblezip FROM so69434887""")
    memview, = cur.fetchone()

with open('so69434887.zip', 'wb') as f:
    f.write(memview)
and is unzippable (on Linux, at least)
$ unzip -p so69434887.zip so69434887.txt
abc
So perhaps the data is not being inserted correctly.
FWIW I got the "End-of-central-directory signature not found" until I made sure I closed the zipfile object before writing to the database.
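On the memoryview point from the question: psycopg2 returns bytea columns as memoryview objects, and file.write() accepts them directly; converting with bytes() is only needed if you want an independent copy. A tiny sketch, reusing the cursor from the code above:

cur.execute("SELECT ansiblezip FROM so69434887")
memview, = cur.fetchone()        # psycopg2 hands back a memoryview over the bytea data
with open("so69434887.zip", "wb") as f:
    f.write(bytes(memview))      # f.write(memview) works just as well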
I am using Python 3.6 to iterate through a folder structure and return the file paths of all these CSVs I want to import into two already created Oracle tables.
con = cx_Oracle.connect('BLAH/BLAH#XXX:666/BLAH')

# Targets the exact filepaths of the CSVs we want to import into the Oracle database
if os.access(base_cust_path, os.W_OK):
    for path, dirs, files in os.walk(base_cust_path):
        if "Daily" not in path and "Daily" not in dirs and "Jul" not in path and "2017-07" not in path:
            for f in files:
                if "OUTPUT" in f and "MERGE" not in f and "DD" not in f:
                    print("Import to OUTPUT table: " + path + "/" + f)
                    # Run function to import to SQL Table 1
                if "MERGE" in f and "OUTPUT" not in f and "DD" not in f:
                    print("Import to MERGE table: " + path + "/" + f)
                    # Run function to import to SQL Table 2
A while ago I was able to use PHP to produce a function that used the BULK INSERT SQL command for SQL Server:
function bulkInserttoDB($csvPath){
    $tablename = "[DATABASE].[dbo].[TABLE]";
    $insert = "BULK
        INSERT ".$tablename."
        FROM '".$csvPath."'
        WITH (FIELDTERMINATOR = ',', ROWTERMINATOR = '\\n')";
    print_r($insert);
    print_r("<br>");
    $result = odbc_prepare($GLOBALS['connection'], $insert);
    odbc_execute($result) or die(odbc_error($connection));
}
I was looking to replicate this in Python, but a few Google searches led me to believe there is no 'BULK INSERT' command for Oracle. That BULK INSERT command had awesome performance.
Since these CSVs I am loading are huge (2GB x 365), performance is crucial. What is the most efficient way of doing this?
The bulk insert is done with the cx_Oracle library, using a prepared statement and executemany:

con = cx_Oracle.connect(CONNECTION_STRING)
cur = con.cursor()

# Prepare your statement once
cur.prepare("""INSERT INTO MyTable VALUES (
    to_date(:1, 'YYYY/MM/DD HH24:MI:SS'),
    :2,
    :3,
    to_date(:4, 'YYYY/MM/DD HH24:MI:SS'),
    :5,
    :6,
    to_date(:7, 'YYYY/MM/DD HH24:MI:SS'),
    :8,
    to_date(:9, 'YYYY/MM/DD HH24:MI:SS'))""")

# Prepare your data: append one tuple per parsed CSV line (sline)
rows = []
rows.append((sline[0], sline[1], sline[2], sline[3], sline[4], sline[5], sline[6], sline[7], sline[8]))

# Insert the whole batch in one call
cur.executemany(None, rows)
You prepare the insert statement once, build up the list of row tuples from your file, and finally call executemany, which sends the whole batch to the database at once instead of one row at a time.
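For CSVs of this size, a sketch of the full loop might look like the following; the file name, table name, column layout, and chunk size are all illustrative placeholders you would adapt:

import csv
import cx_Oracle

con = cx_Oracle.connect(CONNECTION_STRING)  # same connection string as above
cur = con.cursor()
cur.prepare("INSERT INTO MyTable VALUES (to_date(:1, 'YYYY/MM/DD HH24:MI:SS'), :2, :3)")

batch = []
with open("big_file.csv", newline="") as f:
    for sline in csv.reader(f):
        batch.append((sline[0], sline[1], sline[2]))
        if len(batch) >= 50000:            # flush in chunks so memory stays bounded
            cur.executemany(None, batch)
            batch.clear()
if batch:
    cur.executemany(None, batch)           # insert whatever is left over
con.commit()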
Below is a function snippet that I can't get to work correctly.
def upload_csv():
    conn = sqlite3.connect("data.db")
    cursor = conn.cursor()
    # as far as tkFileDialog returns absolute path to file, we have to slice from last slash till the end
    filename = fn[fn.rfind("/")+1:]
    cursor.execute("CREATE TABLE IF NOT EXISTS {0}('MSISDN' INTEGER PRIMARY KEY, 'IMEI' TEXT, 'TAC' INTEGER );".format(filename))
    reader = csv.reader(open(fn, 'r'))
    for row in reader:
        to_db = [unicode(row[0]), unicode(row[1]), unicode(int(row[2][0:8]))]
        print to_db
        cursor.execute("INSERT INTO data.{0} (MSISDN,IMEI,TAC) VALUES (?,?,?);".__format__(filename), to_db)
    conn.commit()
I receive an Operational error:
OperationalError: unknown database May2015
So guys, I found the problem.
In my code I didn't strip the .csv file extension, and that was the problem.
Thanks to CL for the advice to look more closely at the name of the file.
For those who are stuck on a similar problem, the right code is:
#as far as tkFileDialog returns absolute path to file, we have to slice from last slash till the end and also strip the extention!
filename = fn[fn.rfind("/")+1:].strip('.csv')
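One caution, not part of the original answer: str.strip('.csv') removes any of the characters '.', 'c', 's', 'v' from both ends of the name, so a file called stats.csv would come out as "tat". A safer sketch uses os.path:

import os

# fn is the absolute path returned by tkFileDialog, as in the snippet above
filename = os.path.splitext(os.path.basename(fn))[0]   # "May2015.csv" -> "May2015"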
I have a CSV file without headers and am trying to create a SQL table from certain columns in the file. I tried the solutions given here: Importing a CSV file into a sqlite3 database table using Python,
but keep getting the error that col1 is not defined. I then tried inserting headers in my CSV file and am still getting a KeyError.
Any help is appreciated! (I am not very familiar with SQL at all)
If the .csv file has no headers, you don't want to use DictReader; DictReader assumes line 1 is a set of headers and uses them as keys for every subsequent line. This is probably why you're getting KeyErrors.
A modified version of the example from that link:
import csv, sqlite3

con = sqlite3.connect(":memory:")
cur = con.cursor()
cur.execute("CREATE TABLE t (col1, col2);")

with open('data.csv', 'rb') as fin:
    dr = csv.reader(fin)
    dicts = ({'col1': line[0], 'col2': line[1]} for line in dr)
    to_db = ((i['col1'], i['col2']) for i in dicts)
    cur.executemany("INSERT INTO t (col1, col2) VALUES (?, ?);", to_db)

con.commit()
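If you prefer to skip the intermediate dicts, a slightly shorter variant builds the parameter tuples directly (Python 3 text-mode file handling assumed):

import csv, sqlite3

con = sqlite3.connect(":memory:")
cur = con.cursor()
cur.execute("CREATE TABLE t (col1, col2);")

with open('data.csv', 'r', newline='') as fin:
    to_db = ((line[0], line[1]) for line in csv.reader(fin))
    cur.executemany("INSERT INTO t (col1, col2) VALUES (?, ?);", to_db)

con.commit()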
The code below reads all the CSV files under a path and loads their rows into a table in an SQLite 3 database. SQLite has no LOAD DATA INFILE statement, so each file is read with the csv module and inserted with executemany (adjust the table name and the number of ? placeholders to match your columns):

import csv
import glob
import os.path
import sqlite3

cnx = sqlite3.connect('dbname.db')
cursor = cnx.cursor()

path = 'path/to/csv'
for csv_file in glob.glob(os.path.join(path, "*.csv")):
    print("add_csv_file: %s" % csv_file)
    with open(csv_file, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header line
        cursor.executemany("INSERT INTO tablename VALUES (?, ?, ?);", reader)

cnx.commit()
cursor.close()
cnx.close()
Let me know if this works.