Save dataframe in Postgresql Database with SERIAL Autogenerated ID - python

I have a dataframe like the following:
     word classification  counter
0   house           noun        2
1     the        article        2
2   white      adjective        1
3  yellow      adjective        1
I would like to store it in a PostgreSQL table with the following definition:
CREATE TABLE public.word_classification (
    id SERIAL,
    word character varying(100),
    classification character varying(10),
    counter integer,
    start_date date,
    end_date date
);
ALTER TABLE public.word_classification OWNER TO postgres;
The current basic configuration I have is as follows:
from sqlalchemy import create_engine
import pandas as pd

# Postgres username, password, and database name
POSTGRES_ADDRESS = 'localhost'  ## INSERT YOUR DB ADDRESS IF IT'S NOT ON PANOPLY
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'postgres'  ## CHANGE THIS TO YOUR PANOPLY/POSTGRES USERNAME
POSTGRES_PASSWORD = 'BVict31C'  ## CHANGE THIS TO YOUR PANOPLY/POSTGRES PASSWORD
POSTGRES_DBNAME = 'local-sandbox-dev'  ## CHANGE THIS TO YOUR DATABASE NAME

# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=POSTGRES_USERNAME,
                        password=POSTGRES_PASSWORD,
                        ipaddress=POSTGRES_ADDRESS,
                        port=POSTGRES_PORT,
                        dbname=POSTGRES_DBNAME))

# Create the connection
cnx = create_engine(postgres_str)
data = [['the', 'article', 0], ['house', 'noun', 1], ['yellow', 'adjective', 2],
        ['the', 'article', 4], ['house', 'noun', 5], ['white', 'adjective', 6]]
df = pd.DataFrame(data, columns=['word', 'classification', 'position'])
df_db = pd.DataFrame(columns=['word', 'classification', 'counter', 'start_date', 'end_date'])

count_series = df.groupby(['word', 'classification']).size()
new_df = count_series.to_frame(name='counter').reset_index()
df_db = new_df.to_sql('word_classification', cnx, if_exists='append', chunksize=1000)
I would like to insert into the table as I am able to do with SQL syntax:
insert into word_classification (word, classification, counter) values ('hello', 'world', 1);
Currently, I am getting an error when inserting into the table because I am passing the index:
(psycopg2.errors.UndefinedColumn) column "index" of relation "word_classification" does not exist
LINE 1: INSERT INTO word_classification (index, word, classification...
^
[SQL: INSERT INTO word_classification (index, word, classification, counter) VALUES (%(index)s, %(word)s, %(classification)s, %(counter)s)]
[parameters: ({'index': 0, 'word': 'house', 'classification': 'noun', 'counter': 2}, {'index': 1, 'word': 'the', 'classification': 'article', 'counter': 2}, {'index': 2, 'word': 'white', 'classification': 'adjective', 'counter': 1}, {'index': 3, 'word': 'yellow', 'classification': 'adjective', 'counter': 1})]
I have been searching for ways to avoid passing the index, but with no luck.
Thanks for your help

Turn off the index when writing the dataframe to the database, as follows:
df_db = new_df.to_sql('word_classification',cnx,if_exists='append',chunksize=1000, index=False)
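With index=False, pandas no longer sends the dataframe index as a column, so Postgres populates the SERIAL id column on its own. As a minimal check, assuming the cnx engine from the question (the read-back query is only illustrative):
import pandas as pd

# Read the table back; id should now be filled by the SERIAL sequence.
check = pd.read_sql('SELECT id, word, classification, counter FROM word_classification', cnx)
print(check)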

Related

connect a json file data to mysql database (empty table)

I'm a beginner in Python. I'm trying to connect my database to Python, with a JSON file loaded in my program so I can read it and eventually analyze it, but I can't connect to it; I have tried different approaches and still get the same error.
import mysql.connector
import json
# create the key
from mysql.connector import cursor

mydb = mysql.connector.connect(host='localhost', port='3306', user='root',
                               password='nihad147', database='tweets')
mycursor = mydb.cursor()

sql_tweet = """INSERT INTO tweet ( tweet_id,
                                   id_user,
                                   text,
                                   tweet_location,
                                   created_at,
                                   name_screen,
                                   categorie_id,
                                   )
               VALUES (%s,%s,%s,%s,%s,%s,%s)"""
sql_user = """INSERT INTO tweetuser ( id_user,
                                      name_screen,
                                      location_user,
                                      count_followers,
                                      friends_count,
                                      statuse_count)
              VALUES (%s,%s,%s,%s,%s,%s)"""
sql_location = """"insert into tweet_location (
                       location_id,
                       latitude,
                       longitude
                       tweet_id
                   VALUES(%s,%s,%s,%s)"""

myJsonFile = open('tweets.json', encoding="utf-8")

mycursor.execute("DELETE FROM tweet")
mycursor.execute("DELETE FROM tweetuser")
mycursor.execute("DELETE FROM tweet_location")

c = 0
for line in myJsonFile:
    c = c + 1
    print("tweet number ", c, " is uploading to the server")
    data = json.loads(line)

    # insert into tweet
    val_tweet = (
        data['tweet_id'], data['user_id_str'], data['raw_text'],
        data['location']['address']['city'], data['date'], data['user_screen_name'])
    mycursor.execute(sql_tweet, sql_location, val_tweet)
    mydb.commit()

    # testing if the user already exists
    user = "SELECT * FROM tweetuser WHERE id_user = '" + str(data['user_id_str']) + "'"
    mycursor.execute(user)
    myresult = mycursor.fetchall()
    row_count = mycursor.rowcount
    if row_count == 0:
        val_user = (data['user_id_str'], data['user_screen_name'],
                    data['location']['address']['city'], data['user_followers_count'],
                    data['user_friends_count'], data['user_statuses_count'])
        mycursor.execute(sql_user, val_user)
        mydb.commit()
print('done')
Here's an example of the JSON file data:
{
  "tweet_id": "1261276320878788609",
  "date": "Fri May 15 12:44:42 +0000 2020",
  "raw_text": "برنامج وطني لدعم المبدعين في مواجهة #كورون",
  "geo_source": "user_location",
  "location": {
    "address": {
      "country": "Tunisia",
      "country_code": "tn",
      "state_district": "غزالة",
      "county": "العرب",
      "state": "Bizerte"
    },
    "response": "{'place_id': 235309103, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 7124228, 'boundingbox': ['37.105957', '37.2033466', '9.4739053', '9.6124953'], 'lat': '37.1551868', 'lon': '9.54834183807249', 'display_name': 'العرب, غزالة, Bizerte, Tunisia', 'class': 'boundary', 'type': 'administrative', 'importance': 0.45, 'icon': '/data/nominatimimages/mapicons/poi_boundary_administrative.p.20.png','address':{'county': 'العرب', 'state_district': 'غزالة', 'state': 'Bizerte', 'country': 'Tunisia', 'country_code': 'tn'}}",
    "geohash": "snwg37buskzd",
    "query_term": "arab",
    "lon": 9.54834183807249,
    "lat": 37.1551868
  },
  "user_friends_count": 61,
  "user_description": "I love UAE and his great leadership",
  "user_created_at": "Wed Oct 09 11:41:41 +0000 2013",
  "user_screen_name": "SikandarMirani",
  "user_id_str": "706377881",
  "user_verified": false,
  "user_statuses_count": 50804,
  "user_followers_count": 946,
  "user_location": "Dubai United Arab Emirates"
}
Thanks to you guys, I was able to solve the previous error: I hadn't checked the data type of the user id, and it has to be BIGINT rather than INT since the values are large.
I now have no problem connecting my JSON file to my database, but the data only gets inserted into the tweetuser table, not into the tweet table.
The tweet table stays empty.
I would appreciate any kind of help, thank you.
The error
mysql.connector.errors.DataError: 1264 (22003): Out of range value for column 'id_user' at row 1
suggests that the value you are trying to use as the id_user is numerically too large.
Since you haven't posted the table definitions, my guess is you are using MEDIUMINT or SMALLINT or TINYINT for id_user and the actual user ID that you are trying to write into the database is too large for that data type.
In your example, user_id_str is 706377881; however, the maximum values for MEDIUMINT are 8388607 (signed) and 16777215 (unsigned).
Check the data types in the table definitions.
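If that turns out to be the case, one option is to widen the column. A minimal sketch, assuming the tweet and tweetuser tables above and that BIGINT is an acceptable target type (the connection parameters are copied from the question):
import mysql.connector

mydb = mysql.connector.connect(host='localhost', port='3306', user='root',
                               password='nihad147', database='tweets')
cur = mydb.cursor()

# Widen id_user so large Twitter user IDs fit; BIGINT holds values up to 2**63 - 1.
cur.execute("ALTER TABLE tweetuser MODIFY id_user BIGINT")
cur.execute("ALTER TABLE tweet MODIFY id_user BIGINT")
mydb.commit()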
You are connecting to your DB; that is not the problem.
The problem is that the user id you are trying to insert exceeds the maximum value MySQL allows for the data type of that column. See here and here for more info related to your error.

How to return field of record after update query with upsert as true in mongodb using pymongo?

I am trying to insert a record in MongoDB, but I don't want duplicates, so I am using the update command with upsert=True.
import pymongo

client = pymongo.MongoClient(settings.MONGO_DB_URI)
db = client[settings.MONGO_DB_NAME]

filter = {
    'my_id': '1234',
    'name': 'alok'
}
record = {
    'my_id': '1234',
    'name': 'alok',
    'marks': 26
}
status = db['mycollection'].update(filter, {'$setOnInsert': record}, upsert=True)
print('id is ', status['my_id'])  # this will not work but I want such behaviour
This code will insert the record only if there is no existing record matching the filter values. So there are two cases:
It will insert the record.
It will not insert the record if one already exists.
In both cases I want to get my_id. How can I get my_id when the update command executes?
You can search for the document and then print out its ID
print('id is ', db['mycollection'].find_one(filter)['my_id'])
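If a single round trip is preferred, another sketch uses pymongo's find_one_and_update, which can return the stored document in both the insert and the already-exists case (this reuses the filter and record from the question and the same $setOnInsert update):
from pymongo import MongoClient, ReturnDocument

client = MongoClient(settings.MONGO_DB_URI)  # same settings module as in the question
db = client[settings.MONGO_DB_NAME]

doc = db['mycollection'].find_one_and_update(
    filter,
    {'$setOnInsert': record},
    upsert=True,
    return_document=ReturnDocument.AFTER,  # return the document after the upsert
)
print('id is', doc['my_id'])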

Execute user-defined query on list of dictionaries

I have a set of data that a user needs to query using their own query string. The current solution creates a temporary in-memory sqlite database that the query is run against.
The dataset is a list of "flat" dictionaries, i.e. there is no nested data. The query string does not need to be SQL, but it should be simple to define using an existing query framework.
It needs to support ordering (ascending, descending, custom) and filtering.
The purpose of this question is to get a range of different solutions that might work for this use case.
import sqlite3

items = [
    {'id': 1},
    {'id': 2, 'description': 'This is a description'},
    {'id': 3, 'comment': 'This is a comment'},
    {'id': 4, 'height': 1.78}
]

# Assemble temporary sqlite database
conn = sqlite3.connect(':memory:')
cur = conn.cursor()

knownTypes = {"id": "real", "height": "real", "comment": "text"}
allKeys = list(set().union(*(d.keys() for d in items)))
allTypes = list(knownTypes.get(k, "text") for k in allKeys)

createTable_query = "CREATE TABLE data ({});".format(
    ", ".join(["{} {}".format(x[0], x[1]) for x in zip(allKeys, allTypes)]))
cur.execute(createTable_query)
conn.commit()

qs = ["?" for i in range(len(allKeys))]
insertRow_query = "INSERT INTO data VALUES ({});".format(", ".join(qs))
for p in items:
    vals = list([p.get(k, None) for k in allKeys])
    cur.execute(insertRow_query, vals)
conn.commit()

# modify user query here
theUserQuery = "SELECT * FROM data"

# Get data from query
data = [row for row in cur.execute(theUserQuery)]
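For illustration, a user-defined query with filtering and ordering against that temporary table might look like the following (the column names assume the data table built above):
theUserQuery = "SELECT id, height FROM data WHERE height IS NOT NULL ORDER BY height DESC"
for row in cur.execute(theUserQuery):
    print(row)  # rows that have a height, tallest first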
YAQL is what I'm looking for.
It doesn't do SQL, but it does execute a query string - which is a simple way to do complex user-defined sorting and filtering.
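As a rough sketch of what that can look like (the query syntax below follows YAQL's documented collection functions, but treat the exact expressions as an assumption to verify against the YAQL docs):
import yaql

items = [
    {'id': 1},
    {'id': 2, 'description': 'This is a description'},
    {'id': 3, 'comment': 'This is a comment'},
    {'id': 4, 'height': 1.78}
]

engine = yaql.factory.YaqlFactory().create()

# A user-supplied query string: filter and order without any SQL.
expression = engine('$.where($.id > 1).orderByDescending($.id)')
print(list(expression.evaluate(data=items)))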
There's a library called litebox that does what you want. It is backed by SQLite.
from litebox import LiteBox

items = [
    {'id': 1},
    {'id': 2, 'description': 'This is a description'},
    {'id': 3, 'comment': 'This is a comment'},
    {'id': 4, 'height': 1.78}
]

types = {"id": int, "height": float, "comment": str}
lb = LiteBox(items, types)
lb.find("height > 1.5")
Result: [{'id': 4, 'height': 1.78}]

Printing the response of a RethinkDB query in a reasonable way

I am participating in the Yelp Dataset Challenge and I'm using RethinkDB to store the JSON documents for each of the different datasets.
I have the following script:
import rethinkdb as r
import json, os
RDB_HOST = os.environ.get('RDB_HOST') or 'localhost'
RDB_PORT = os.environ.get('RDB_PORT') or 28015
DB = 'test'
connection = r.connect(host=RDB_HOST, port=RDB_PORT, db=DB)
query = r.table('yelp_user').filter({"name":"Arthur"}).run(connection)
print(query)
But when I run it at the terminal in a virtualenv I get this as an example response:
<rethinkdb.net.DefaultCursor object at 0x102c22250> (streaming):
[{'yelping_since': '2014-03', 'votes': {'cool': 1, 'useful': 2, 'funny': 1}, 'review_count': 5, 'id': '08eb0b0d-2633-4ec4-93fe-817a496d4b52', 'user_id': 'ZuDUSyT4bE6sx-1MzYd2Kg', 'compliments': {}, 'friends': [], 'average_stars': 5, 'type': 'user', 'elite': [], 'name': 'Arthur', 'fans': 0}, ...]
I know I can use pprint to pretty-print output, but a bigger issue I don't understand how to resolve is printing the results in full, rather than having the output truncated with "..." at the end.
Any suggestions?
run returns an iterable cursor. Iterate over it to get all the rows:
query = r.table('yelp_user').filter({"name": "Arthur"})
for row in query.run(connection):
    print(row)
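Since the question already mentions pprint, a small variation of the same loop (same connection as above) prints each document in full, without the "..." truncation:
from pprint import pprint

cursor = r.table('yelp_user').filter({"name": "Arthur"}).run(connection)
for row in cursor:
    pprint(row)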

sqlalchemy: previous row and next row by id

I have a table Images with id and name. Given an image's id, I want to query its previous and next images in the database using SQLAlchemy. How can I do it in only one query?
sel = select([images.c.id, images.c.name]).where(images.c.id == id)
res = engine.connect().execute(sel)
#How to obtain its previous and next row?
...
Suppose it is possible that some rows have been deleted, i.e., the ids may not be continuous. For example,
Table: Images
------------
id | name
------------
1 | 'a.jpg'
2 | 'b.jpg'
4 | 'd.jpg'
------------
prev_image = your_session.query(Images).order_by(Images.id.desc()).filter(Images.id < id).first()
next_image = your_session.query(Images).order_by(Images.id.asc()).filter(Images.id > id).first()
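For context, here is a minimal, self-contained sketch of how those two ORM queries could be wired up (the Images model, session, and SQLite engine are illustrative, not taken from the question):
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Images(Base):
    __tablename__ = 'images'
    id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine('sqlite:///:memory:')  # illustrative backend
Base.metadata.create_all(engine)

with Session(engine) as your_session:
    your_session.add_all([Images(id=1, name='a.jpg'),
                          Images(id=2, name='b.jpg'),
                          Images(id=4, name='d.jpg')])
    your_session.commit()

    id = 2  # the target image id
    prev_image = (your_session.query(Images).order_by(Images.id.desc())
                  .filter(Images.id < id).first())
    next_image = (your_session.query(Images).order_by(Images.id.asc())
                  .filter(Images.id > id).first())
    print(prev_image.name, next_image.name)  # a.jpg d.jpg
The SQLAlchemy Core answer below does the same thing with two explicit selects.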
# previous
prv = select([images.c.id, images.c.name]).where(images.c.id < id).order_by(images.c.id.desc()).limit(1)
res = engine.connect().execute(prv)
for row in res:
    print(row.id, row.name)

# next
nxt = select([images.c.id, images.c.name]).where(images.c.id > id).order_by(images.c.id).limit(1)
res = engine.connect().execute(nxt)
for row in res:
    print(row.id, row.name)
This can be accomplished in a "single" query by taking the UNION of two queries, one to select the previous and target records and one to select the next record (unless the backend is SQLite, which does not permit an ORDER BY before the final statement in a UNION):
import sqlalchemy as sa
...
with engine.connect() as conn:
    target = 3
    query1 = sa.select(tbl).where(tbl.c.id <= target).order_by(tbl.c.id.desc()).limit(2)
    query2 = sa.select(tbl).where(tbl.c.id > target).order_by(tbl.c.id.asc()).limit(1)
    res = conn.execute(query1.union(query2))
    for row in res:
        print(row)
producing
(2, 'b.jpg')
(3, 'c.jpg')
(4, 'd.jpg')
Note that we could make the second query the same as the first, apart from reversing the inequality
query2 = sa.select(tbl).where(tbl.c.id >= target).order_by(tbl.c.id.asc()).limit(2)
and we would get the same result as the union would remove the duplicate target row.
If the requirement were to find the surrounding rows for a selection of rows we could use the lag and lead window functions, if they are supported.
# Works in PostgreSQL, MariaDB and SQLite, at least.
with engine.connect() as conn:
    query = sa.select(
        tbl.c.id,
        tbl.c.name,
        sa.func.lag(tbl.c.name).over(order_by=tbl.c.id).label('prev'),
        sa.func.lead(tbl.c.name).over(order_by=tbl.c.id).label('next'),
    )
    res = conn.execute(query)
    for row in res:
        print(row._mapping)
Output:
{'id': 1, 'name': 'a.jpg', 'prev': None, 'next': 'b.jpg'}
{'id': 2, 'name': 'b.jpg', 'prev': 'a.jpg', 'next': 'c.jpg'}
{'id': 3, 'name': 'c.jpg', 'prev': 'b.jpg', 'next': 'd.jpg'}
{'id': 4, 'name': 'd.jpg', 'prev': 'c.jpg', 'next': 'e.jpg'}
{'id': 5, 'name': 'e.jpg', 'prev': 'd.jpg', 'next': 'f.jpg'}
{'id': 6, 'name': 'f.jpg', 'prev': 'e.jpg', 'next': None}
To iterate through your records, which I think is what you're looking for:
for row in res:
    print(row.id)
    print(row.name)
