Good way to read csvData using psycopg2 - python

I am trying to get a fast i.e. fast and not a lot of code, way to get csv data into postgres data base. I am reading into python using csvDictreader which works fine. Then I need to generate code somehow that takes the dicts and puts it into a table. I want to do this automaticaly as my tables often have hundreds of variables. (I don't want to read directly to Postgres because in many cases I must transform the data and python is good for that)
This is some of what I have got:
import psycopg2
import sys
import itertools
import sys, csv
import psycopg2.extras
import psycopg2.extensions
csvReader=csv.DictReader(open( '/home/matthew/Downloads/us_gis_data/statesp020.csv', "rb"), delimiter = ',')
#close.cursor()
x = 0
ConnectionString = "host='localhost' dbname='mydb' user='postgres' password='######"
try:
connection = psycopg2.extras.DictConnection(ConnectionString)
print "connecting"
except:
print "did not work"
# Create a test table with some data
dict_cur = connection.cursor()
#dict_cur.execute("CREATE TABLE test (id serial PRIMARY KEY, num integer, data varchar);")
for i in range(1,50):
x = x+1
print x
dict_cur.execute("INSERT INTO test (num, data) VALUES(%s, %s)",(x, 3.6))#"abc'def"))
### how to I create the table and insert value using the dictreader?
dict_cur.execute("SELECT * FROM test")
for k in range(0,x+1):
rec = dict_cur.fetchone()
print rec['num'], rec['data']

Say you have a list of field names (presumably you can get this from the header of your csv file):
fieldnames = ['Name', 'Address', 'City', 'State']
Assuming they're all VARCHARs, you can create the table "TableName":
sql_table = 'CREATE TABLE TableName (%s)' % ','.join('%s VARCHAR(50)' % name for name in fieldnames)
cursor.execute(sql_table)
You can insert the rows from a dictionary "dict":
sql_insert = ('INSERT INTO TableName (%s) VALUES (%s)' %
(','.join('%s' % name for name in fieldnames),
','.join('%%(%s)s' % name for name in fieldnames)))
cursor.execute(sql_insert, dict)
Or do it in one go, given a list dictionaries:
dictlist = [dict1, dict2, ...]
cursor.executemany(sql_insert, dictlist)
You can adapt this as necessary based on the type of your fields and the use of DictReader.

I am a novice but this worked for me. I used PG Admin to create the 'testCSV' table.
import psycopg2 as dbapi
con = dbapi.connect(database="testpg", user="postgres", password="secret")
cur = con.cursor()
import csv
csvObject = csv.reader(open(r'C:\testcsv.csv', 'r'), dialect = 'excel', delimiter = ',')
passData = "INSERT INTO testCSV (param1, param2, param3, param4, param5) VALUES (%s,%s,%s,%s,%s);"
for row in csvObject:
csvLine = row
cur.execute(passData, csvLine)
con.commit()

Related

Insert data from csv to postgreSQL database via Python

I'm brand new to postgreSQL or SQL at all.
I'm trying to create a table in a database via Python and then load data from a .csv file into the table.
My code looks like this:
import csv
import psycopg2
#Establish connection to database
con = psycopg2.connect(
host = "localhost",
database = "kundeavgang",
user = "postgres",
password = "postgres",
)
#Cursor
cur = con.cursor()
#If a mistake is made, start from scratch
cur.execute("DROP TABLE IF EXISTS kundeavgang")
#Create table
cur.execute('''
CREATE TABLE "kundeavgang"(
"customerID" TEXT,
"gender" TEXT,
"SeniorCitizen" TEXT,
"Partner" TEXT,
"Dependents" TEXT,
"tenure" INT,
"PhoneService" TEXT,
"MultipleLines" TEXT,
"InternetService" TEXT,
"OnlineSecurity" TEXT,
"DeviceProtection" TEXT,
"TechSupport" TEXT,
"StreamingMovies" TEXT,
"Contract" TEXT,
"PaperlessBilling" TEXT,
"PaymentMethod" TEXT,
"MonthlyCharges" FLOAT,
"TotalCharges" FLOAT,
"Churn" TEXT
)
''')
#Acsess .csv file
with open('kundeavgang.csv') as csvFile:
reader = csv.reader(csvFile)
skipHeader = next(reader) #Account for header
for row in reader:
customerID = row[0]
gender = row[1]
SeniorCitizen = row[2]
Partner = row[3]
Dependents = row[4]
tenure = row[5]
PhoneService = row[6]
MultipleLines = row[7]
InternetService = row[8]
OnlineSecurity = row[9]
OnlineBackup = row[10]
DeviceProtection = row[11]
TechSupport = row[12]
StreamingTV = [13]
StreamingMovies = row[14]
Contract = row[15]
PaperlessBilling = row[16]
PaymentMethod = row[17]
MonthlyCharges = row[18]
TotalCharges = row[19]
Churn = row[20]
cur.execute('''INSERT INTO kundeavgang(customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''',(customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn))
#Commit the transaction
con.commit()
#End connection
con.close()
In pgAdmin, the table comes up as existing in the database. However, I cannot find the actual table. Further, I have no idea about this line of code:
cur.execute('''INSERT INTO kundeavgang(customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''',(customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn))
What does all the %s stand for? I found it off an online example which was not very helpful, so I tried it without knowing what it means. I have seen some examples where question marks are inserted instead, but also this without explanation.
Lastly, as the code stands now, I get the error message:
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''',(customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn))
IndexError: tuple index out of range
All help or explanations will be appreciated.
For bulk inserts from text files, consider copy_from or copy_expert of psycopg2. Also, be sure to commit your execution:
cur.execute("DROP TABLE IF EXISTS kundeavgang")
con.commit()
cur.execute('''CREATE TABLE "kundeavgang" ... ''')
con.commit()
with open('kundeavgang.csv') as csvFile:
next(csvFile) # SKIP HEADERS
cur.copy_from(csvFile, "kundeavgang", sep=",")
# POSTGRES COPY COMMAND FOR CSV MODE
# cur.copy_expert("""COPY "kundeavgang" FROM STDIN WITH CSV""", csvFile)
con.commit()
The %s are placeholders for the values that will be inserted and passed through the following tuple:
(customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn)
The problem that your insert statement going to insert to 20 columns, you provide 20 values in your tuple but you have 22 placeholders (%s).
The problem is a mismatch between the number of columns to be populated and the length of the list provided. This is an easy mistake to make when dealing with a lot of columns. One way to reduce risk of error is to use the length of the columns or values list to build the statement.
cols = [name1, name2,...]
vals = [val1, val2, ...]
assert len(cols) == len(vals), 'mismatch between number of columns and number of values'
template = """INSERT INTO tbl ({}) VALUES ({})"""
stmt = template.format(', '.join(cols), ','.join(['%s'] * len(vals)))
cur.execute(stmt, vals)
Note that when building the column names dynamically it's good practice to quote them - psycopg2 provides tools for this.
Change the line reader = csv.reader(csvFile) to:
reader = csv.reader(csvFile, delimiter=';')

Create/Insert Json in Postgres with requests and psycopg2

Just started a project with PostgreSQL. I would like to make the leap from Excel to a database and I am stuck on create and insert. Once I run this I will have to switch it to Update I believe so I don't continue to write over the current data. I know my connection is working but i get the following error.
My Error is: TypeError: not all arguments converted during string formatting
#!/usr/bin/env python
import requests
import psycopg2
conn = psycopg2.connect(database='NHL', user='postgres', password='postgres', host='localhost', port='5432')
req = requests.get('http://www.nhl.com/stats/rest/skaters?isAggregate=false&reportType=basic&isGame=false&reportName=skatersummary&sort=[{%22property%22:%22playerName%22,%22direction%22:%22ASC%22},{%22property%22:%22goals%22,%22direction%22:%22DESC%22},{%22property%22:%22assists%22,%22direction%22:%22DESC%22}]&cayenneExp=gameTypeId=2%20and%20seasonId%3E=20172018%20and%20seasonId%3C=20172018')
data = req.json()['data']
my_data = []
for item in data:
season = item['seasonId']
player = item['playerName']
first_name = item['playerFirstName']
last_Name = item['playerLastName']
playerId = item['playerId']
height = item['playerHeight']
pos = item['playerPositionCode']
handed = item['playerShootsCatches']
city = item['playerBirthCity']
country = item['playerBirthCountry']
state = item['playerBirthStateProvince']
dob = item['playerBirthDate']
draft_year = item['playerDraftYear']
draft_round = item['playerDraftRoundNo']
draft_overall = item['playerDraftOverallPickNo']
my_data.append([playerId, player, first_name, last_Name, height, pos, handed, city, country, state, dob, draft_year, draft_round, draft_overall, season])
cur = conn.cursor()
cur.execute("CREATE TABLE t_skaters (data json);")
cur.executemany("INSERT INTO t_skaters VALUES (%s)", (my_data,))
Sample of data:
[[8468493, 'Ron Hainsey', 'Ron', 'Hainsey', 75, 'D', 'L', 'Bolton', 'USA', 'CT', '1981-03-24', 2000, 1, 13, 20172018], [8471339, 'Ryan Callahan', 'Ryan', 'Callahan', 70, 'R', 'R', 'Rochester', 'USA', 'NY', '1985-03-21', 2004, 4, 127, 20172018]]
It seems like you want to create a table with one column named "data". The type of this column is JSON. (I would recommend creating one column per field, but it's up to you.)
In this case the variable data (that is read from the request) is a list of dicts. As I mentioned in my comment, you can loop over data and do the inserts one at a time as executemany() is not faster than multiple calls to execute().
What I did was the following:
Create a list of fields that you care about.
Loop over the elements of data
For each item in data, extract the fields into my_data
Call execute() and pass in json.dumps(my_data) (Converts my_data from a dict into a JSON-string)
Try this:
#!/usr/bin/env python
import requests
import psycopg2
import json
conn = psycopg2.connect(database='NHL', user='postgres', password='postgres', host='localhost', port='5432')
req = requests.get('http://www.nhl.com/stats/rest/skaters?isAggregate=false&reportType=basic&isGame=false&reportName=skatersummary&sort=[{%22property%22:%22playerName%22,%22direction%22:%22ASC%22},{%22property%22:%22goals%22,%22direction%22:%22DESC%22},{%22property%22:%22assists%22,%22direction%22:%22DESC%22}]&cayenneExp=gameTypeId=2%20and%20seasonId%3E=20172018%20and%20seasonId%3C=20172018')
# data here is a list of dicts
data = req.json()['data']
cur = conn.cursor()
# create a table with one column of type JSON
cur.execute("CREATE TABLE t_skaters (data json);")
fields = [
'seasonId',
'playerName',
'playerFirstName',
'playerLastName',
'playerId',
'playerHeight',
'playerPositionCode',
'playerShootsCatches',
'playerBirthCity',
'playerBirthCountry',
'playerBirthStateProvince',
'playerBirthDate',
'playerDraftYear',
'playerDraftRoundNo',
'playerDraftOverallPickNo'
]
for item in data:
my_data = {field: item[field] for field in fields}
cur.execute("INSERT INTO t_skaters VALUES (%s)", (json.dumps(my_data),))
# commit changes
conn.commit()
# Close the connection
conn.close()
I am not 100% sure if all of the postgres syntax is correct here (I don't have access to a PG database to test), but I believe that this logic should work for what you are trying to do.
Update For Separate Columns
You can modify your create statement to handle multiple columns, but it would require knowing the data type of each column. Here's some psuedocode you can follow:
# same boilerplate code from above
cur = conn.cursor()
# create a table with one column per field
cur.execute(
"""CREATE TABLE t_skaters (seasonId INTEGER, playerName VARCHAR, ...);"""
)
fields = [
'seasonId',
'playerName',
'playerFirstName',
'playerLastName',
'playerId',
'playerHeight',
'playerPositionCode',
'playerShootsCatches',
'playerBirthCity',
'playerBirthCountry',
'playerBirthStateProvince',
'playerBirthDate',
'playerDraftYear',
'playerDraftRoundNo',
'playerDraftOverallPickNo'
]
for item in data:
my_data = [item[field] for field in fields]
# need a placeholder (%s) for each variable
# refer to postgres docs on INSERT statement on how to specify order
cur.execute("INSERT INTO t_skaters VALUES (%s, %s, ...)", tuple(my_data))
# commit changes
conn.commit()
# Close the connection
conn.close()
Replace the ... with the appropriate values for your data.

Handling of binary data with Python + psycopg2 (DB: Postgres 9.5)

Lets say I have table test with columns id (of bigint type) & data (of bytea type).
I don't want to display actual binary data from column data when I execute below query in Python.
select * from test;
I just want a place holder which display <binary data> or <BLOB>, because some of data are in hundreds of MB in that column which does not make any sense to display binary data in column.
Is it possible to identify and replace binary data with place holder in psycopg2 ?
#!/usr/bin/python
import psycopg2
conn = psycopg2.connect(database = "testdb", user = "postgres", password = "pass123", host = "127.0.0.1", port = "5432")
print "Opened database successfully"
cur = conn.cursor()
cur.execute("SELECT id, data from test")
rows = cur.fetchall()
for row in rows:
print("ID = ", row[0])
print("DATA = ", row[1])
print "Operation done successfully";
conn.close()
We fetch the result from database and generate html report from the result, here user can provide any query in html textbox so query is not static, we execute that query and generate the html table. This is in-house report generation script.
If the data is bytea you can write your own bytea typecaster as an object that wraps a binary string.
Note that the data is fetched and sent on the network anyway. If you don't want that overhead just don't select those fields.
>>> import psycopg2
>>> class Wrapper:
... def __init__(self, thing):
... self.thing = thing
...
>>> psycopg2.extensions.register_type(
... psycopg2.extensions.new_type(
... psycopg2.BINARY.values, "WRAPPER", lambda x, cur: Wrapper(x)))
>>> cnn = psycopg2.connect('')
>>> cur = cnn.cursor()
>>> cur.execute("create table btest(id serial primary key, data bytea)")
>>> cur.execute("insert into btest (data) values ('foobar')")
>>> cur.execute("select * from btest")
>>> r = cur.fetchone()
>>> r
(1, <__main__.Wrapper instance at 0x7fb8740eba70>)
>>> r[1].thing
'\\x666f6f626172'
Please refer to the documentation for the extensions functions used.

I need to export a sql table into a pipe delimited text file including the column names as well using python

I am currently using the below and it imports the data which is comma seperated without the column names.
Also, there is a problem with the dates in the table, as it returns all the dates columns as "datetime.date(1925, 1, 24)", where the actual date in the table is like "1925-01-24".
import MySQLdb
conn = MySQLdb.connect(host = 'localhost', user = 'username', passwd = 'password', port = port, db = 'DBNAME')
cursor = conn.cursor()
query = """SELECT * FROM myschema.mytable;"""
cursor.execute(query)
FILE = cursor.fetchall()
with open('FILE.txt', 'w') as f:
for row in cursor:
f.write("%s\n" % str(row))
Why use python at all?
SELECT 'column name1', 'column name2'
UNION ALL
SELECT * FROM myschema.mytable INTO OUTFILE '/tmp/mytable.csv' FIELDS TERMINATED BY '|';
Just type this in the mysql console. Note if you get an error in this query saying something about secure file privilages do
SHOW VARIABLES LIKE "secure_file_priv";
And then use the specified location instead of '/tmp' in the previous query.

Python and sqlite- load tweets

Working/learning on loading a set of tweets into a sqlite db in Python 2.7 to then do some queries. But can't seem to get the data to read in.
I can create the db, create the table, but when I go to read the tweets, parse and load- it (seems) it is not populating the table.
Of the many possible fields in each tweet, I am only trying to capture six.
My code is:
#create database works fine
import sqlite3
conn = sqlite3.connect('twitter.db')
c = conn.cursor()
import urllib2
tweets=urllib2.urlopen("http://rasinsrv07.cstcis.cti.depaul.edu/CSC455/assignment4.txt")
tweets.readline()
#create table: works fine
c.execute("CREATE TABLE Tweet(created_at, id, text, source, in_reply_to_user_ID,retweet_Count)")
#Loads variables & data in table: Not loading. I think this is my problem here.
for elt in tweets:
currentRow = elt[:-1].split(", ")
insert = """insert into Tweet values ('%s', '%s', '%s', '%s', '%s', '%s')""" % ("created_at", "id", "text", 'source', 'in_reply_to_user_ID', 'retweet_Count')
print insert
conn.commit()
#tried to see the table
read_tweets = """SELECT * from Tweet """
c.fetchall()
#tried again to see the table. Must be no data in the table
read_tweets = """SELECT * from Tweet """
rows = c.fetchall()
for row in rows:
print row

Categories

Resources