Connect JSON file data to a MySQL database (empty table) - Python

I'm a beginner in Python. I'm trying to connect my database to Python, with a JSON file loaded in my program so I can read it and eventually analyze it for certain things. But I can't connect to it; I've tried different approaches and still get the same error.
import mysql.connector
import json
# create the key
from mysql.connector import cursor
mydb = mysql.connector.connect(host='localhost', port='3306', user='root', password='nihad147', database='tweets')
mycursor = mydb.cursor()
sql_tweet = """INSERT INTO tweet ( tweet_id,
id_user,
text,
tweet_location,
created_at,
name_screen,
categorie_id,
)
VALUES (%s,%s,%s,%s,%s,%s,%s)"""
sql_user = """INSERT INTO tweetuser (
id_user,
name_screen,
location_user,
count_followers,
friends_count,
statuse_count)
VALUES (%s,%s,%s,%s,%s,%s)"""
sql_location = """"insert into tweet_location (
location_id,
latitude,
longitude
tweet_id
VALUES(%s,%s,%s,%s)"""
myJsonFile = open('tweets.json', encoding="utf-8")
mycursor.execute("DELETE FROM tweet")
mycursor.execute("DELETE FROM tweetuser")
mycursor.execute("DELETE FROM tweet_location")
c = 0
for line in myJsonFile:
    c = c + 1
    print("tweet number ", c, " is uploading to the server")
    data = json.loads(line)
    # insert into tweet
    val_tweet = (
        data['tweet_id'], data['user_id_str'], data['raw_text'], data['location']['address']['city'],
        data['date'], data['user_screen_name'])
    mycursor.execute(sql_tweet, sql_location, val_tweet)
    mydb.commit()
    # testing if the user already exists
    user = "SELECT * FROM tweetuser WHERE id_user = '" + str(data['user_id_str']) + "'"
    mycursor.execute(user)
    myresult = mycursor.fetchall()
    row_count = mycursor.rowcount
    if row_count == 0:
        val_user = (data['user_id_str'], data['user_screen_name'], data['location']['address']['city'],
                    data['user_followers_count'], data['user_friends_count'], data['user_statuses_count'])
        mycursor.execute(sql_user, val_user)
        mydb.commit()
print('done')
Here's an example of the JSON file data:
{
"tweet_id":"1261276320878788609",
"date":"Fri May 15 12:44:42 +0000 2020",
"raw_text":"برنامج وطني لدعم المبدعين في مواجهة #كورون",
"geo_source":"user_location",
"location":{
"address":{
"country":"Tunisia",
"country_code":"tn",
"state_district":"غزالة",
"county":"العرب",
"state":"Bizerte"
},
"response":"{'place_id': 235309103, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 7124228, 'boundingbox': ['37.105957', '37.2033466', '9.4739053', '9.6124953'], 'lat': '37.1551868', 'lon': '9.54834183807249', 'display_name': 'العرب, غزالة, Bizerte, Tunisia', 'class': 'boundary', 'type': 'administrative', 'importance': 0.45, 'icon': '/data/nominatimimages/mapicons/poi_boundary_administrative.p.20.png','address':{'county': 'العرب', 'state_district': 'غزالة', 'state': 'Bizerte', 'country': 'Tunisia', 'country_code': 'tn'}}",
"geohash":"snwg37buskzd",
"query_term":"arab",
"lon":9.54834183807249,
"lat":37.1551868
},
"user_friends_count":61,
"user_description":"I love UAE and his great leadership",
"user_created_at":"Wed Oct 09 11:41:41 +0000 2013",
"user_screen_name":"SikandarMirani",
"user_id_str":"706377881",
"user_verified":false,
"user_statuses_count":50804,
"user_followers_count":946,
"user_location":"Dubai United Arab Emirates"
}
Thanks to you guys, I was able to solve the previous error: I hadn't checked the data type of id_user, and it has to be BIGINT, not INT, since the IDs are large values.
I had no problem connecting my JSON file to my database, but the data got inserted only into the tweetuser table, not into the tweet table.
The tweet table is empty.
I would appreciate any kind of help, thank you.

The error
mysql.connector.errors.DataError: 1264 (22003): Out of range value for column 'id_user' at row 1
suggests that the value you are trying to use as the id_user is numerically too large.
Since you haven't posted the table definitions, my guess is you are using MEDIUMINT or SMALLINT or TINYINT for id_user and the actual user ID that you are trying to write into the database is too large for that data type.
In your example, user_id_str is 706377881, but the maximum value for MEDIUMINT is 8388607 signed and 16777215 unsigned.
Check the data types in the table definitions.
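For reference, here is a minimal sketch of how you could inspect and widen that column with mysql.connector. It assumes the tweet table and connection details from the question, and that BIGINT is wide enough for your user IDs:
import mysql.connector

mydb = mysql.connector.connect(host='localhost', port='3306', user='root',
                               password='nihad147', database='tweets')
cur = mydb.cursor()

# inspect the current definition of id_user
cur.execute("SHOW COLUMNS FROM tweet LIKE 'id_user'")
print(cur.fetchone())

# widen the column so large Twitter user IDs fit (run once)
cur.execute("ALTER TABLE tweet MODIFY id_user BIGINT")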

You are connecting to your DB, that is not the problem.
The problem is that the user id that you are trying to insert has a length that surpasses the maximum allowed by MySQL for the datatype of that field. See here and here for more info related to your error.

Related

Add API currency conversion to a Postgres table in Python

I have an exercise to do:
Using the psycopg2 library, connect to my database and get the top 10 customers along with their credit limits:
import psycopg2
connection = psycopg2.connect(
    host='host',
    user='user',
    password='password',
    dbname='db_name',
)
cursor = connection.cursor()
cursor.execute('SELECT customername, creditlimit FROM customers order by 2 desc limit 10')
for row in cursor:
    row[0]
    row[1]
    print(f"{row[0]}: {row[1]} USD")
This is the result of the query above:
As the next step I need to connect to an API and convert the creditlimit to different currencies (EUR, JPY, GBP), so the final output will look like this:
Euro+ Shopping Channel: 227600.00 USD / 187183.93EUR / 24017262.40 JPY /160578.63 GBP
...
This is the code for the API (the key is valid, feel free to test):
import requests

headers = {
    "apikey": "b6iGXz0JZcr2iyV0SeLcd0dEpqOr8ngK"
}
currency = ['EUR', 'JPY', 'GBP']
params = {
    "from": "USD",
    "to": 'EUR',
    "amount": row[1]
}
url = "https://api.apilayer.com/exchangerates_data/convert"
response = requests.get(url, headers=headers, params=params)
if response.status_code == 200:
    data = response.json()
I connected to the API, but I'm having difficulty combining the converted values with my SQL data; please advise.
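One way to combine the two pieces is sketched below: re-run the query, then call the convert endpoint once per currency for each customer row. The 'result' key is an assumption about the shape of the API's JSON response, so adjust it to whatever the payload actually contains:
import psycopg2
import requests

connection = psycopg2.connect(host='host', user='user', password='password', dbname='db_name')
cursor = connection.cursor()
cursor.execute('SELECT customername, creditlimit FROM customers order by 2 desc limit 10')
rows = cursor.fetchall()

headers = {"apikey": "b6iGXz0JZcr2iyV0SeLcd0dEpqOr8ngK"}
url = "https://api.apilayer.com/exchangerates_data/convert"
currencies = ['EUR', 'JPY', 'GBP']

for name, credit_limit in rows:
    parts = [f"{credit_limit} USD"]
    for currency in currencies:
        params = {"from": "USD", "to": currency, "amount": credit_limit}
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            converted = response.json().get("result")  # 'result' key assumed, check the actual response
            if converted is not None:
                parts.append(f"{converted:.2f} {currency}")
    print(f"{name}: {' / '.join(parts)}")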

Problems inserting a new entry in Astra Cassandra

I am doing a migration from Cassandra on an AWS machine to Astra Cassandra, and there are some problems:
I cannot insert new data into Astra Cassandra when a column value is around 2 million characters and 1.77 MB (and I have bigger data to insert, around 20 million characters). Does anyone know how to address the problem?
I am inserting it via a Python app (cassandra-driver==3.17.0), and this is the error stack I get:
start.sh[5625]: [2022-07-12 15:14:39,336]
INFO in db_ops: error = Error from server: code=1500
[Replica(s) failed to execute write]
message="Operation failed - received 0 responses and 2 failures: UNKNOWN from 0.0.0.125:7000, UNKNOWN from 0.0.0.181:7000"
info={'consistency': 'LOCAL_QUORUM', 'required_responses': 2, 'received_responses': 0, 'failures': 2}
If I use half of those characters, it works.
New Astra Cassandra CQL Console table description:
token#cqlsh> describe mykeyspace.series;
CREATE TABLE mykeyspace.series (
type text,
name text,
as_of timestamp,
data text,
hash text,
PRIMARY KEY ((type, name, as_of))
) WITH additional_write_policy = '99PERCENTILE'
AND bloom_filter_fp_chance = 0.01
AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
AND comment = ''
AND compaction = {'class': 'org.apache.cassandra.db.compaction.UnifiedCompactionStrategy'}
AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
AND crc_check_chance = 1.0
AND default_time_to_live = 0
AND gc_grace_seconds = 864000
AND max_index_interval = 2048
AND memtable_flush_period_in_ms = 0
AND min_index_interval = 128
AND read_repair = 'BLOCKING'
AND speculative_retry = '99PERCENTILE';
Old Cassandra table description:
ansible#cqlsh> describe mykeyspace.series;
CREATE TABLE mykeyspace.series (
type text,
name text,
as_of timestamp,
data text,
hash text,
PRIMARY KEY ((type, name, as_of))
) WITH bloom_filter_fp_chance = 0.01
AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
AND comment = ''
AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
AND crc_check_chance = 1.0
AND dclocal_read_repair_chance = 0.1
AND default_time_to_live = 0
AND gc_grace_seconds = 864000
AND max_index_interval = 2048
AND memtable_flush_period_in_ms = 0
AND min_index_interval = 128
AND read_repair_chance = 0.0
AND speculative_retry = '99PERCENTILE';
Data sample :
{"type": "OP", "name": "book", "as_of": "2022-03-17", "data": [{"year": 2022, "month": 3, "day": 17, "hour": 0, "quarter": 1, "week": 11, "wk_year": 2022, "is_peak": 0, "value": 1.28056854009628e-08}, .... ], "hash": "84421b8d934b06488e1ac464bd46e83ccd2beea5eb2f9f2c52428b706a9b2a10"}
where this JSON contains 27,000 entries inside the data array, like:
{"year": 2022, "month": 3, "day": 17, "hour": 0, "quarter": 1, "week": 11, "wk_year": 2022, "is_peak": 0, "value": 1.28056854009628e-08}
Python part of the code :
def insert_to_table(self, table_name, **kwargs):
    try:
        ...
        elif table_name == "series":
            self.session.execute(
                self.session.prepare("INSERT INTO series (type, name, as_of, data, hash) VALUES (?, ?, ?, ?, ?)"),
                (
                    kwargs["type"],
                    kwargs["name"],
                    kwargs["as_of"],
                    kwargs["data"],
                    kwargs["hash"],
                ),
            )
        return True
    except Exception as error:
        current_app.logger.error('src/db/db_ops.py insert_to_table() table_name = %s error = %s', table_name, error)
        return False
Big thanks !
You are hitting the configured limit for the maximum mutation size. On Cassandra, this defaults to 16 MB, while on Astra DB at the moment it is 4 MB (it's possible that it will be raised, but performing inserts with very large cell sizes is still strongly discouraged).
A more agile approach to storing this data would be to revise your data model and split the big row with the huge string into several rows, each containing a single item of the 27000 or so entries. With proper use of partitioning, you would still be able to retrieve the whole contents with a single query (paginated between the database and the driver for your convenience, which would help avoid the annoying timeouts that may arise when reading such large individual rows).
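As an illustration only, here is a rough sketch of that split, using a hypothetical series_entries table; the table name, the payload dict and the already-connected session are placeholders, not your actual schema or code:
import json

# hypothetical table, one small row per entry of the original "data" array:
#   CREATE TABLE mykeyspace.series_entries (
#       type text, name text, as_of timestamp,
#       entry_index int, entry text,
#       PRIMARY KEY ((type, name, as_of), entry_index)
#   );

def insert_series_entries(session, payload, as_of):
    insert_stmt = session.prepare(
        "INSERT INTO series_entries (type, name, as_of, entry_index, entry) VALUES (?, ?, ?, ?, ?)"
    )
    # ~27,000 small rows instead of one huge cell
    for i, entry in enumerate(payload["data"]):
        session.execute(insert_stmt, (payload["type"], payload["name"], as_of, i, json.dumps(entry)))

def read_series_entries(session, type_, name, as_of):
    # a single partition-key query returns the whole series, paginated by the driver
    select_stmt = session.prepare(
        "SELECT entry FROM series_entries WHERE type = ? AND name = ? AND as_of = ?"
    )
    return [json.loads(row.entry) for row in session.execute(select_stmt, (type_, name, as_of))]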
Incidentally, I suggest you create the prepared statement only once, outside of the insert_to_table function (caching it or something). In the insert function you then simply call self.session.execute(already_prepared_statement, (value1, value2, ...)), which would noticeably improve your performance.
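For example, a minimal sketch of that caching; the DbOps class name is just a placeholder for wherever your session lives:
class DbOps:
    def __init__(self, session):
        self.session = session
        # prepared once, at startup, then reused for every insert
        self.insert_series_stmt = session.prepare(
            "INSERT INTO series (type, name, as_of, data, hash) VALUES (?, ?, ?, ?, ?)"
        )

    def insert_to_table(self, table_name, **kwargs):
        if table_name == "series":
            self.session.execute(
                self.insert_series_stmt,
                (kwargs["type"], kwargs["name"], kwargs["as_of"], kwargs["data"], kwargs["hash"]),
            )
            return True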
A last point: I believe the drivers are able to connect to Astra DB only starting from version 3.24.0, so I'm not sure how you are using version 3.17. I don't think version 3.17 knows of the cloud argument to the Cluster constructor. In any case, I suggest you upgrade the drivers to the latest version (currently 3.25.0).
There's something not quite right with the details you posted in your question.
In the schema you posted, the data column is of type text:
data text,
But the sample data you posted looks like you are inserting key/value pairs, which oddly seem to be formatted like a CQL collection type.
If it were really a string, it would be formatted as:
... "data": "key1: value1, key2: value2, ...", ...
Review your data and code then try again. Cheers!
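If the value you bind is in fact a Python dict rather than a string, one minimal fix (just a sketch) is to serialize it before the insert:
import json

# serialize the parsed structure so it matches the text column
kwargs["data"] = json.dumps(kwargs["data"])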
Finally, the short solution that made the data migration from an in-house Cassandra DB to Astra Cassandra (DataStax) possible was to use compression (zlib).
So the "data" field of each entry is compressed in Python with zlib and then stored in Cassandra, to reduce the size of the entries.
def insert_to_table(self, table_name, **kwargs):
    try:
        if table_name == 'series_as_of':
            ...
        elif table_name == 'series':
            list_of_bytes = bytes(json.dumps(kwargs["data"]), 'latin1')
            compressed_data = zlib.compress(list_of_bytes)
            a = str(compressed_data, 'latin1')
            self.session.execute(
                self.session.prepare(INSERT_SERIES_QUERY),
                (
                    kwargs["type"],
                    kwargs["name"],
                    kwargs["as_of"],
                    a,
                    kwargs["hash"],
                ),
            )
        ....
Then, when reading the entries, a decompression step is needed:
def get_series(self, query_type, **kwargs):
    try:
        if query_type == "data":
            execution = self.session.execute_async(
                self.session.prepare(GET_SERIES_DATA_QUERY),
                (kwargs["type"], kwargs["name"], kwargs["as_of"]),
                timeout=30,
            )
            decompressed = None
            a = None
            for row in execution.result():
                my_bytes = row[0].encode('latin1')
                decompressed = zlib.decompress(my_bytes)
                a = str(decompressed, encoding="latin1")
            return a
        ...
The data now looks like this in Astra Cassandra:
token#cqlsh> select * from series where type = 'OP' and name = 'ZTP_PFM_H_LUX_PHY' and as_of = '2022-09-30';
type | name | as_of | data | hash
------+-------------------+---------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------
OP | ZTP_PFM_H_LUX_PHY | 2022-09-30 00:00:00.000000+0000 | x\x9c-\x8dÉ\nÂ#\x10D\x7fEúl`z\x16Mü\x80\x90\x83â\x1c\x14\F\x86îYÈÅ\x05\x92\x8b\x88ÿnÂx*ªÞ\x83\x82\x8f\x83ñýJ\x0e6\x0b\x07{ë`9å\x83îÿår°Þ¶;ßùíñämw.\x02\rþ\x99\x8b!\x85\x94\x95h*%\n\x8a4ÒL®·¹õ4ôÅÓÙ¨\x10\të \x99H\x04¡\x8cf6¹!\x95\x02'\x93"Jb\x1dë\x84ÈT¯U\x90\x19\x11W8\x1dp£\x8d\x83/ü\x00Ó<0\x99 | 4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba873c2f11161202b945
(1 rows)
The long-term solution is to rearrange the data as @Stefano Lottini said.

Insert data into a MySQL table using Python

I'm a beginner in Python, and I'm trying to insert my JSON file data into my database table using Python. The problem is that I get no errors; I just get:
tweet number 49634 is uploading to the server
I don't see where the problem is; I would appreciate any help.
import mysql.connector
import json
mydb = mysql.connector.connect(host='localhost', port='3306', user='root', password='nihad147', database='tweets')
mycursor = mydb.cursor()
sql_request='insert ignore into tweet_location (latitude, longitude, tweet_id) values (%s,%s,%s)'""
myJsonFile = open('tweet.json', encoding="utf-8")
c = 0
for line in myJsonFile:
    c = c + 1
    print("tweet number ", c, " is uploading to the server")
    data = json.loads(line)
    #line = line.replace('','')
    tweet = "SELECT * FROM tweet WHERE tweet_id = '" + str(data['tweet_id']) + "'"
    mycursor.execute(tweet)
    myresult = mycursor.fetchall()
    row_count = mycursor.rowcount
    if row_count == 0:
        if 'location' in data.keys() and data['location'] != None and 'address' in data['location']:
            latitude = data['location']['lat']
            longitude = data['location']['lon']
            mycursor.execute(sql_request, (latitude, longitude, data['tweet_id']))
            print('------------')
            mydb.commit()
Here's an example of my JSON file data:
{"tweet_id":"1261276320878788609",
"date":"Fri May 15 12:44:42 +0000 2020",
"raw_text":"برنامج وطني لدعم المبدعين في مواجهة #كورو"
"geo_source":"user_location",
"location":{"address":
{"country":"Tunisia","country_code":"tn","state_district":"غزالة","county":"العرب","state":"Bizerte"},
"response":
"{'place_id': 235309103, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 7124228, 'boundingbox': ['37.105957', '37.2033466', '9.4739053', '9.6124953'], 'lat': '37.1551868', 'lon': '9.54834183807249', 'display_name': 'العرب, غزالة, Bizerte, Tunisia', 'class': 'boundary', 'type': 'administrative', 'importance': 0.45, 'icon': '/data/nominatimimages/mapicons/poi_boundary_administrative.p.20.png','address':{'county': 'العرب', 'state_district': 'غزالة', 'state': 'Bizerte', 'country': 'Tunisia', 'country_code': 'tn'}}",
"geohash":"snwg37buskzd","query_term":"arab","lon":9.54834183807249,"lat":37.1551868},
"user_friends_count":61,"user_description":"I love UAE and his great leadership",
"user_created_at":"Wed Oct 09 11:41:41 +0000 2013",
"user_screen_name":"SikandarMirani",
"user_id_str":"706377881",
"user_verified":false,
"user_statuses_count":50804,
"user_followers_count":946,
"user_location":"Dubai United Arab Emirates"}
And these are the attributes of my tweet table:
tweet_id,
id_user,
text,
tweet_location,
created_at,
name_screen,
categorie_id,
Don't read the JSON file one line at a time. Use json.load() to load the entire file into a dictionary.
Use a parameter in the query that selects the tweet, rather than concatenating data['tweet_id'] into the SQL.
The code that inserts the new row should be inside all the if statements that set latitude and longitude from the data. In fact, you might as well put all the database code inside the if statement that checks whether location is set in the JSON.
import mysql.connector
import json

mydb = mysql.connector.connect(host='localhost', port='3306', user='root', password='nihad147', database='tweets')
mycursor = mydb.cursor()
sql_request = 'insert ignore into tweet_location (latitude, longitude, tweet_id) values (%s,%s,%s)'

with open('tweet.json', encoding="utf-8") as myJsonFile:
    data = json.load(myJsonFile)

if data.get('location') and 'address' in data['location']:
    tweet = "SELECT 1 FROM tweet WHERE tweet_id = %s"
    mycursor.execute(tweet, (data['tweet_id'],))
    myresult = mycursor.fetchall()
    row_count = len(myresult)
    if row_count == 0:
        print(f"Inserting {data['tweet_id']} to the database")
        latitude = data['location']['lat']
        longitude = data['location']['lon']
        mycursor.execute(sql_request, (latitude, longitude, data['tweet_id']))
        mydb.commit()
        print('------------')
    else:
        print(f"Tweet {data['tweet_id']} is already in the database")

Psycopg2 - Inserting complex query with strings + numbers? [duplicate]

I have a tuple as below
data = ({'weather station name': 'Substation', 'wind': '6 km/h', 'barometer': '1010.3hPa', 'humidity': '42%', 'temperature': '34.5 C', 'place_id': '001D0A00B36E', 'date': '2016-05-10 09:48:58'})
I am trying to push the values from the above tuple to the postgres table using the code below:
try:
    con = psycopg2.connect("dbname='WeatherForecast' user='postgres' host='localhost' password='****'")
    cur = con.cursor()
    cur.executemany("""INSERT INTO weather_data(temperature,humidity,wind,barometer,updated_on,place_id) VALUES (%(temperature)f, %(humidity)f, %(wind)f, %(barometer)f, %(date)s, %(place_id)d)""", final_weather_data)
    ver = cur.fetchone()
    print(ver)
except psycopg2.DatabaseError as e:
    print('Error {}'.format(e))
    sys.exit(1)
finally:
    if con:
        con.close()
Where datatype of each field in the DB is as below:
id serial NOT NULL,
temperature double precision NOT NULL,
humidity double precision NOT NULL,
wind double precision NOT NULL,
barometer double precision NOT NULL,
updated_on timestamp with time zone NOT NULL,
place_id integer NOT NULL,
When I run the code to push the data into the Postgres table using psycopg2, it raises the error "ValueError: unsupported format character 'f'".
I suppose the issue is in the formatting. I am using Python 3.4.
Have a look at the documentation:
The variables placeholder must always be a %s, even if a different placeholder (such as a %d for integers or %f for floats) may look more appropriate:
>>> cur.execute("INSERT INTO numbers VALUES (%d)", (42,)) # WRONG
>>> cur.execute("INSERT INTO numbers VALUES (%s)", (42,)) # correct
Meanwhile, your SQL query contains all kinds of placeholders:
"""INSERT INTO weather_data(temperature,humidity,wind,barometer,updated_on,place_id)
VALUES (%(temperature)f, %(humidity)f, %(wind)f, %(barometer)f, %(date)s, %(place_id)d)"""
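Applied to your query, a corrected sketch uses %(name)s everywhere; note that values such as '34.5 C', '42%' or '6 km/h' would still have to be stripped down to plain numbers before they fit the double precision columns:
cur.executemany(
    """INSERT INTO weather_data (temperature, humidity, wind, barometer, updated_on, place_id)
       VALUES (%(temperature)s, %(humidity)s, %(wind)s, %(barometer)s, %(date)s, %(place_id)s)""",
    final_weather_data,  # a sequence of dicts with matching keys
)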

What are the possible ways to process JSON data using SQL, Elasticsearch, or preprocessing in Python?

I have a case study where I need to take data from a REST API, do some analysis on the data using aggregate functions, joins, etc., and use the response data in JSON format to plot some retail graphs.
Approaches being followed till now:
Read the data from the JSON, store it in Python variables, and insert it with SQL queries. This is obviously a costly operation, because it performs a database insert for every JSON line read; for 33k rows it takes more than 20 minutes, which is inefficient.
This could be handled in Elasticsearch for faster processing, but complex operations like joins are not available in Elasticsearch.
If anybody can suggest the best approach to follow for handling such scenarios (like preprocessing or postprocessing in Python), it would be helpful.
Thanks in advance
SQL script:
def store_data(AccountNo):
    db = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DATABASE, charset="utf8")
    cursor = db.cursor()
    insert_query = "INSERT INTO cstore (AccountNo) VALUES (%s)"
    cursor.execute(insert_query, (AccountNo))
    db.commit()
    cursor.close()
    db.close()
    return

def on_data(file_path):
    #This is the meat of the script...it connects to your mongoDB and stores the tweet
    try:
        # Decode the JSON from Twitter
        testFile = open(file_path)
        datajson = json.load(testFile)
        #print (len(datajson))
        #grab the wanted data from the Tweet
        for i in range(len(datajson)):
            for cosponsor in datajson[i]:
                AccountNo = cosponsor['AccountNo']
                store_data(AccountNo)
Edit 1: JSON added
{
"StartDate": "1/1/18",
"EndDate": "3/30/18",
"Transactions": [
{
"CSPAccountNo": "41469300",
"ZIP": "60098",
"ReportDate": "2018-03-08T00:00:00",
"POSCode": "00980030003",
"POSCodeModifier": "0",
"Description": "TIC TAC GUM WATERMEL",
"ActualSalesPrice": 1.59,
"TotalCount": 1,
"Totalsales": 1.59,
"DiscountAmount": 0,
"DiscountCount": 0,
"PromotionAmount": 0,
"PromotionCount": 0,
"RefundAmount": 0,
"RefundCount": 0
},
{
"CSPAccountNo": "41469378",
"ZIP": "60098",
"ReportDate": "2018-03-08T00:00:00",
"POSCode": "01070080727",
"POSCodeModifier": "0",
"Description": "PAYDAY KS",
"ActualSalesPrice": 2.09,
"TotalCount": 1,
"Totalsales": 2.09,
"DiscountAmount": 0,
"DiscountCount": 0,
"PromotionAmount": 0,
"PromotionCount": 0,
"RefundAmount": 0,
"RefundCount": 0
}
]
}
I do not have your JSON file, so I don't know if this is runnable, but I would try something like the code below: I read just your account info into a list and then write it to the DB in one go with executemany. I expect it to have a better (lower) execution time than 20 minutes.
def store_data(AccountNo):
    db = MySQLdb.connect(host=HOST, user=USER, passwd=PASSWD, db=DATABASE, charset="utf8")
    cursor = db.cursor()
    # MySQLdb uses pyformat placeholders (%(name)s), not :name style
    insert_query = "INSERT INTO cstore (AccountNo, ZIP, ReportDate) VALUES (%(AccountNo)s, %(ZIP)s, %(ReportDate)s)"
    cursor.executemany(insert_query, AccountNo)
    db.commit()
    cursor.close()
    db.close()
    return

def on_data(file_path):
    # This is the meat of the script...it connects to your mongoDB and stores the tweet
    try:
        # declare an empty list for all the account numbers
        accountno_list = list()
        # Decode the JSON from Twitter
        testFile = open(file_path)
        datajson = json.load(testFile)
        # print (len(datajson))
        # grab the wanted data from the Tweet
        # the posted JSON is a single object, so iterate over its Transactions array
        for row in datajson['Transactions']:
            values = dict()
            values['AccountNo'] = row['CSPAccountNo']
            values['ZIP'] = row['ZIP']
            values['ReportDate'] = row['ReportDate']
            # from here on you can populate the attributes you need in a similar way..
            accountno_list.append(values)
    except:
        pass
    store_data(accountno_list)
