Convert very big JSON to SQL using ijson - Python

I have a 60GB JSON file and I want to convert it to SQL with ijson. (I tried many software packages and tools; they weren't useful and my system kept crashing.)
Note: this is not a duplicate! I have seen all the code about these methods and wrote this, but I still have very slow code.
Here is my code:
import json
import sqlite3
import ijson
import threading

def solve():
    with sqlite3.connect("filename.db") as conn:
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS mytable(
                    username VARCHAR(225)
                    ,phone INTEGER
                    ,id INTEGER PRIMARY KEY
                );''')
        except:
            pass
        keys = ["username", "phone", "id"]
        with open('VERYBIGJSON.json', 'r', encoding='utf-8') as json_file:
            data = ijson.parse(json_file, multiple_values=True)
            for prefix, event, value in data:
                if str(event) == 'start_map':
                    _u = None
                if prefix == "id":
                    _id = value
                if prefix == "username":
                    _u = value
                if prefix == "phone":
                    _p = value
                try:
                    if str(event) == 'end_map':
                        values = (_u, _p, _id)
                        cmd = """INSERT INTO mytable (username, phone, id) VALUES(
                            ?,
                            ?,
                            ?
                        );"""
                        conn.execute(cmd, values)
                        conn.commit()
                except Exception as e:
                    #print(str(e))
                    continue

if __name__ == '__main__':
    t = []
    for i in range(1000):
        t.append(threading.Thread(target=solve))
    for i in t:
        i.start()
    for i in t:
        i.join()
I tested both the multithreading and the multiprocessing method and my code still runs very, very slowly (only about 10KB of my SQL database is generated per second).
I want to do it very efficiently.
My JSON sample is:
{"message":"{\"_\":\"user\",\"delete\":{\"test\":true},\"flags\":2067,\"id\":11111110,\"phone\":\"xxxxxxxxxx\",\"photo\":{\"_\":\"userProfilePhoto\",\"photo_id\":\"xxxxxxxxxx\",\"photo_small\":{\"_\":\"file\",\"dcs\":4,\"volume_id\":\"8788701\",\"local_id\":385526},\"photo\":{\"_\":\"file\",\"dcs\":4,\"local_id\":385528}},\"status\":{\"_\":\"userStat\",\"online\":1566173427}}","phone":"xxxxxxxxxx","#version":"1","id":11111110}
{"index": {"_type": "_doc", "_id": "-Lcy4m8BAObvGO9GAsFa"}}
....
Please give me an idea to improve the code speed.
UPDATE:
According to the comments I wrote code for converting my big JSON file to .csv, but it is still slow, although faster than method 1!
Here is the code:
import json
import sqlite3
import ijson
import csv
import pandas as pd
from pandas.io.json import json_normalize
from multiprocessing import Process
import threading

def solve():
    with open('VERYBIGJSON.json', 'r', encoding='utf-8') as json_file:
        data = ijson.parse(json_file, multiple_values=True)
        for prefix, event, value in data:
            if str(event) == 'start_map':
                _u = None
            if prefix == "id":
                _id = value
            if prefix == "username":
                _u = value
            if prefix == "phone":
                _p = value
            if str(event) == 'end_map':
                values = [{'username': '{}'.format(_u), 'id': '{}'.format(_id), 'phone': '{}'.format(_p)}]  # wrap in a dict so it can go through json_normalize
                df = json_normalize(values)
                df.to_csv('test.csv', index=False, mode='a', encoding='utf-8', header=False)

if __name__ == '__main__':
    solve()
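For reference, a quicker variant of this CSV step is sketched below: it avoids building a one-row DataFrame and reopening test.csv for every record, and instead keeps a single csv.writer open for the whole parse. It assumes, as in the sample, that username/phone/id are top-level keys of each object.

import csv
import ijson

def solve_csv():
    with open('VERYBIGJSON.json', 'r', encoding='utf-8') as json_file, \
         open('test.csv', 'w', newline='', encoding='utf-8') as out:
        writer = csv.writer(out)
        _u = _p = _id = None
        for prefix, event, value in ijson.parse(json_file, multiple_values=True):
            if event == 'start_map' and prefix == '':
                _u = _p = _id = None              # reset per top-level object
            elif prefix == 'username':
                _u = value
            elif prefix == 'phone':
                _p = value
            elif prefix == 'id':
                _id = value
            elif event == 'end_map' and prefix == '':
                writer.writerow([_u, _id, _p])    # one row per object, file stays open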
Also, with the JsonSlicer library:
from jsonslicer import JsonSlicer

def solve():
    with open('VERYBIGJSON.json', 'r', encoding='utf-8') as json_file:
        for key in JsonSlicer(json_file, (None, None)):
            print(key)

if __name__ == '__main__':
    solve()
I get this error:
for key in JsonSlicer(json_file , (None , None) ) :
RuntimeError: YAJL error: parse error: trailing garbage
d_from":"telegram_contacts"} {"index": {"_type": "_doc", "_i
(right here) ------^
I think this library doesn't support my JSON file.
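For reference, most of the slowness in the first version comes from committing after every single row and from 1000 threads all parsing the same file and writing to the same database. A single-process sketch that batches rows with executemany and commits once per batch might look like the following (INSERT OR IGNORE stands in for the try/except used above to skip duplicate ids; the batch size is an arbitrary choice):

import sqlite3
import ijson

def convert(json_path='VERYBIGJSON.json', db_path='filename.db', batch_size=10000):
    conn = sqlite3.connect(db_path)
    conn.execute('''CREATE TABLE IF NOT EXISTS mytable(
                        username VARCHAR(225),
                        phone INTEGER,
                        id INTEGER PRIMARY KEY);''')
    insert = 'INSERT OR IGNORE INTO mytable (username, phone, id) VALUES (?, ?, ?)'
    batch = []
    _u = _p = _id = None
    with open(json_path, 'r', encoding='utf-8') as json_file:
        for prefix, event, value in ijson.parse(json_file, multiple_values=True):
            if event == 'start_map' and prefix == '':
                _u = _p = _id = None              # reset fields for each top-level object
            elif prefix == 'username':
                _u = value
            elif prefix == 'phone':
                _p = value
            elif prefix == 'id':
                _id = value
            elif event == 'end_map' and prefix == '':
                if _id is not None:               # skip records without a top-level id
                    batch.append((_u, _p, _id))
                if len(batch) >= batch_size:
                    conn.executemany(insert, batch)
                    conn.commit()                 # one commit per batch instead of per row
                    batch.clear()
    if batch:
        conn.executemany(insert, batch)
        conn.commit()
    conn.close()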

Related

Use query string template from config in bigquery result

I am trying to use a dynamic query string from a config file and format it in the result section using pyodbc, but the query string is printed as a static value. I expect the DB result to be populated dynamically.
Code:
def bq_exec_sql(sql):
    client = bigquery.Client(project='development')
    return client.query(sql)

def generate_select(sql, templete_select):
    job = bq_exec_sql(sql)
    result = ''
    print(templete_select)
    try:
        for row in job:
            result += templete_select
        print(result)
    except:
        pass

if __name__ == '__main__':
    for source in dashboard_activity_list:
        sql = config.get(source).source_select                 # from config file
        templete_select = config.get(source).select_template   # from config file
        generate_select(sql, templete_select)
Actual Output:
select '"+row['table_id']+"' as table_id, '"+row['frequency']+"' as frequency from `trend-dev.test.table1`
sselect '"+row['table_id']+"' as table_id, '"+row['info']+"' as info from `trend-dev.test.table2`
Expected Output:
select table_name1 as table_id, daily from `trend-dev.test.table1`
select table_name2 as table_id, chart_data from `trend-dev.test.table2`
Config file:
dashboard_activity_list: [source_partner_trend, source_partner_normal]

source_partner_trend:
    source_select : select * from `trend-dev.test.trend_partner`
    source_key : trend_source_partner
    distination_table : test.trend_partner_dashboard_feed
    select_template : select '"+row['table_id']+"' as table_id, '"+row['frequency']+"' as frequency from `trend-dev.test.table1`

source_partner_normal:
    source_select : select * from `trend-dev.test.normal_partner`
    source_key : normal_source_partner
    distination_table : test.normal_partner_dashboard_feed
    select_template : select '"+row['table_id']+"' as table_id, '"+row['info']+"' as info from `trend-dev.test.table2`
I have replicated your code and it appears that it is passing the whole query in the templete_select variable as a text string. I modified it so that it separates the row value from the text string in the select statement, and it produced the expected output.
Working Snippet:
from google.cloud import bigquery

def bq_exec_sql(sql):
    client = bigquery.Client(project='development')
    return client.query(sql)

def generate_select(sql, templete_select):
    job = bq_exec_sql(sql)
    result = ''
    try:
        for row in job:
            result += templete_select.format(row['table_id'])
            print(result)
    except:
        print('error')

if __name__ == '__main__':
    sql = 'select * from `<table_name>` limit 2'
    templete_select = " select {} as table_id"
    generate_select(sql, templete_select)
I found that one of the best options to sort out the question is to use the eval() method to evaluate the dynamically formatted string against each row of the result loop:
result += eval(f'f{templete_select!r}')
Ref : https://python-forum.io/thread-24481.html
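As an illustration of how that eval() trick behaves (the row values and template below are made up, and the config template has to use f-string style placeholders such as {row['table_id']} for this to work):

row = {'table_id': 'table_name1', 'frequency': 'daily'}
templete_select = "select {row['table_id']} as table_id, {row['frequency']} from `trend-dev.test.table1`"

# f'f{templete_select!r}' turns the template into the source text of an f-string literal,
# which eval() then evaluates with the local `row` in scope.
result = eval(f'f{templete_select!r}')
print(result)   # select table_name1 as table_id, daily from `trend-dev.test.table1`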

TypeError: can only concatenate str (not "bytes") to str - decrypting from pandas

I'm trying to decrypt my encrypted data stored in a text file in my DropBox bucket. When I create a pandas dataframe of the txt data and match it up with my user input, I pick exclusively the row that matches. Using this data I run it through my decryption (AES_CBC).
When I run it I get the error: "TypeError: can only concatenate str (not "bytes") to str".
The error comes from my padding function (p4 = padded_text(df1)).
I'm not sure if this is an encoding issue, or whether it is because I'm using .get() as my user input and, since that's a string, it isn't working. Any help would be greatly appreciated.
def login():
    import sqlite3
    import os.path

    def decoder():
        from Crypto.Cipher import AES
        import hashlib
        from secrets import token_bytes

        cursor.execute(
            '''
            Select enc_key FROM Login where ID = (?);
            ''',
            (L_ID_entry.get(), ))
        row = cursor.fetchone()
        if row is not None:
            keys = row[0]

        import pyDes

        # design padding function for encryption
        def padded_text(data_in):
            while len(data_in) % 16 != 0:
                data_in = data_in + b"0"
            return data_in

        # calling stored key from main file and reverting back to bytes
        key_original = bytes.fromhex(keys)
        mode = AES.MODE_CBC
        # model
        cipher = AES.new(key_original, mode, IV3)
        # padding data
        p4 = padded_text(df1)
        p5 = padded_text(df2)
        p6 = padded_text(df3)
        # decrypting data
        d_fname = cipher.decrypt(p4)
        d_sname = cipher.decrypt(p5)
        d_email = cipher.decrypt(p6)

    # connecting to db
    try:
        conn = sqlite3.connect('login_details.db')
        cursor = conn.cursor()
        print("Connected to SQLite")
    except sqlite3.Error as error:
        print("Failure, error: ", error)
    finally:
        # downloading txt from dropbox and converting to dataframe to operate on
        import New_user
        _, res = client.files_download("/user_details/enc_logins.txt")
        with io.BytesIO(res.content) as csvfile:
            df = pd.read_csv(csvfile, sep=';', names=['ID', 'Fname', 'Sname', 'Email'])
            newdf = df.loc[df['ID'] == L_ID_entry.get()]
            print(newdf)
            df1 = newdf['Fname']
            df2 = newdf['Sname']
            df3 = newdf['Email']
            csvfile.close()
            decoder()
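For what it's worth, the TypeError is raised because padded_text() receives a pandas Series of str while the padding appends the bytes literal b"0". A hedged sketch of the kind of change that keeps everything in bytes follows; it reuses newdf and cipher from the code above and assumes the ciphertext was stored hex-encoded (if it was stored some other way, the decoding step differs):

def padded_bytes(data_in: bytes) -> bytes:
    # pad with b"0" up to a 16-byte boundary, staying in bytes the whole way
    while len(data_in) % 16 != 0:
        data_in = data_in + b"0"
    return data_in

fname_cell = newdf['Fname'].iloc[0]       # one cell of the dataframe, a str
fname_bytes = bytes.fromhex(fname_cell)   # assumption: ciphertext was written out hex-encoded
d_fname = cipher.decrypt(padded_bytes(fname_bytes))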

Decryption not working - how to get raw data from csv/pandas - python

Below is my code for decrypting from a CSV file stored on DropBox. I get the user to type in their ID, I match this with a database containing hashed values, and then I use the ID typed in to search my stored CSV file for the matching row. I then place all the row values into my decryption function.
Also, I'm aware my variable names/formatting are awful; I'm just using this code as a prototype right now.
My results are being printed as such:
b'b\xebS\x1b\xc8v\xe2\xf8\xa2\\x84\x0e7M~\x1b'
b'\x01B#6i\x1b\xfc]\xc3\xca{\xd5{B\xbe!'
b'in*V7\xf3P\xa0\xb2\xc5\xd2\xb7\x1dz~\x95'
I store my key and IV so they are always the same, yet the decryption doesn't seem to work. My only thought is that perhaps my data is changed somehow when stored in a CSV or pandas table; does anyone know what the issue could be, or whether the bytes can be altered when stored/imported into a dataframe?
Also, maybe I am extracting the data from my CSV to pandas incorrectly?
def login():
    import sqlite3
    import os.path

    def decoder():
        from Crypto.Cipher import AES
        import hashlib
        from secrets import token_bytes

        cursor.execute(
            '''
            Select enc_key FROM Login where ID = (?);
            ''',
            (L_ID_entry.get(), ))
        row = cursor.fetchone()
        if row is not None:
            keys = row[0]

        # design padding function for encryption
        def padded_text(data_in):
            while len(data_in) % 16 != 0:
                data_in = data_in + b"0"
            return data_in

        # calling stored key from main file and reverting back to bytes
        key_original = bytes.fromhex(keys)
        mode = AES.MODE_CBC
        # model
        cipher = AES.new(key_original, mode, IV3)
        # padding data
        p4 = padded_text(df1.tobytes())
        p5 = padded_text(df2.tobytes())
        p6 = padded_text(df3.tobytes())
        # decrypting data
        d_fname = cipher.decrypt(p4)
        d_sname = cipher.decrypt(p5)
        d_email = cipher.decrypt(p6)
        print(d_fname)
        print(d_sname)
        print(d_email)

    # connecting to db
    try:
        conn = sqlite3.connect('login_details.db')
        cursor = conn.cursor()
        print("Connected to SQLite")
    except sqlite3.Error as error:
        print("Failure, error: ", error)
    finally:
        # downloading txt from dropbox and converting to dataframe to operate on
        import New_user
        import ast
        _, res = client.files_download("/user_details/enc_logins.csv")
        with io.BytesIO(res.content) as csvfile:
            with open("enc_logins.csv", 'rb'):
                df = pd.read_csv(csvfile, names=['ID', 'Fname', 'Sname', 'Email'], encoding='unicode_escape')
                newdf = df[(df == L_ID_entry.get()).any(axis=1)]
                print(newdf)
                df1 = newdf['Fname'].to_numpy()
                df2 = newdf['Sname'].to_numpy()
                df3 = newdf['Email'].to_numpy()
                print(df1)
                print(df2)
                print(df3)
                csvfile.close()
                decoder()
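One likely culprit (an assumption from the symptoms, not something confirmed in the post) is that raw AES ciphertext bytes get mangled on the round trip through a text CSV; encoding the ciphertext to text (for example base64) before saving and decoding it after loading keeps the bytes intact. A minimal self-contained sketch of that round trip:

import base64

def ciphertext_to_text(ciphertext: bytes) -> str:
    # base64-encode raw AES output so it can live safely in a CSV cell
    return base64.b64encode(ciphertext).decode('ascii')

def text_to_ciphertext(stored: str) -> bytes:
    # decode the CSV cell back into exactly the bytes that were encrypted
    return base64.b64decode(stored)

sample = b'\x01B#6i\x1b\xfc]\xc3\xca{\xd5{B\xbe!'
assert text_to_ciphertext(ciphertext_to_text(sample)) == sample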

psycopg2.errors.UndefinedTable: table "live" does not exist - pushing a large dataframe to a Postgres database

I am trying to push a large dataframe to a Postgres DB using df.to_sql. The problem is that it takes forever to complete; I have up to a million dataframe rows, so I decided to use Python multiprocessing.
Below is my code:
def parallelize_push_to_db(df, table, chunk_size):
    chunk = split_df(df, chunk_size=chunk_size)
    chunk_len = len(chunk)
    processes = list()
    start_time = time.time()
    for i in range(chunk_len):
        process = Process(target=pussh_df_to_db, args=(chunk[i], table))
        process.start()
        processes.append(process)
    for pro in processes:
        pro.join()
    end_time = time.time() - start_time
    print(f'parra {end_time}')

def split_df(df, chunk_size):
    chunks = list()
    split_size = math.ceil(len(df) / chunk_size)
    for i in range(split_size):
        chunks.append(df[i * chunk_size:(i + 1) * chunk_size])
    return chunks

def pussh_df_to_db(df, table):
    # i think we should append the new df to the data in the db if not empty
    base = DB_Base()
    base.pg_cur.execute(f"DROP TABLE IF EXISTS {table}")
    print(df.shape)
    print('running...........to send dt to postgres')
    df.to_sql(table, con=base.sql_alchemy_engine_conn, if_exists='replace')

if __name__ == "__main__":
    parallelize_push_to_db(check_data, 'live', 1000)
Some part of the code runs, but it fails later with the error below:
in do_execute
cursor.execute(statement, parameters)
sqlalchemy.exc.ProgrammingError: (psycopg2.errors.UndefinedTable) table "user_user_recommendations_AAa" does not exist
[SQL:
DROP TABLE "live"]
I think each process tries to create a new table and drop the old one. I really don't know how to solve this; any help?
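One hedged sketch of a way out of the race (DB_Base, split_df and Process are taken from the question; if_exists='append' and the empty-table trick are assumptions): create the empty table once in the parent before spawning workers, and let each worker only append, so no process ever drops a table another process is writing to.

from multiprocessing import Process

def prepare_table(df, table, engine):
    # run once in the parent: (re)create an empty table with the dataframe's columns
    df.head(0).to_sql(table, con=engine, if_exists='replace', index=False)

def pussh_df_to_db(df, table):
    # workers only append; they never drop or replace the shared table
    base = DB_Base()
    df.to_sql(table, con=base.sql_alchemy_engine_conn, if_exists='append', index=False)

def parallelize_push_to_db(df, table, chunk_size):
    prepare_table(df, table, DB_Base().sql_alchemy_engine_conn)
    processes = [Process(target=pussh_df_to_db, args=(chunk, table))
                 for chunk in split_df(df, chunk_size=chunk_size)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()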
Before I start, I would like to say that you may be better off using the threading module. Before deciding to use this sort of technique you must understand the task you are doing: some tasks are CPU-bound while others are I/O-heavy. Try to understand this before you choose which one you want, though sometimes both will work.
def parallelize_push_to_db(df, table, chunk_size):
    chunk = split_df(df, chunk_size=chunk_size)
    chunk_len = len(chunk)
    processes = list()
    start_time = time.time()
    for i in range(chunk_len):
        process = Process(target=pussh_df_to_db, args=(chunk[i], table))
        process.start()
        processes.append(process)
    for pro in processes:
        pro.join()
    end_time = time.time() - start_time
    print(f'parra {end_time}')
The first answer will also work, but here is a solution for your question.
Some alternative ways:
Save the df to a CSV file and COPY it into the PostgreSQL DB.
Refer to: https://www.postgresql.org/docs/13/sql-copy.html
COPY table_name [ ( column_name [, ...] ) ]
    FROM { 'filename' | PROGRAM 'command' | STDIN }
    [ [ WITH ] ( option [, ...] ) ]

where option can be one of:

    FORMAT format_name
    OIDS [ boolean ]
    FREEZE [ boolean ]
    DELIMITER 'delimiter_character'
    NULL 'null_string'
    HEADER [ boolean ]
    QUOTE 'quote_character'
    ESCAPE 'escape_character'
    FORCE_QUOTE { ( column_name [, ...] ) | * }
    FORCE_NOT_NULL ( column_name [, ...] )
    ENCODING 'encoding_name'
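A short, hedged sketch of that first alternative driven from Python (the table name and DSN are placeholders; copy_expert streams the CSV through STDIN instead of a server-side file):

import io
import psycopg2

def copy_csv_df(df, table, dsn):
    buf = io.StringIO()
    df.to_csv(buf, index=False, header=False)   # dump the dataframe as CSV text in memory
    buf.seek(0)
    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        # COPY ... FROM STDIN loads the whole stream server-side, much faster than row INSERTs
        cur.copy_expert(f'COPY {table} FROM STDIN WITH (FORMAT csv)', buf)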
Use a fast way to dump the df to the DB.
Refer to: https://code.i-harness.com/en/q/16089da
import io
import csv
from sqlalchemy import create_engine

def df2db(df_a, table_name, engine):
    output = io.StringIO()
    # ignore the index
    # df_a.to_csv(output, sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
    df_a.to_csv(output, sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='\\')
    output.getvalue()
    # jump to start of stream
    output.seek(0)

    # engine -> from sqlalchemy import create_engine
    connection = engine.raw_connection()
    cursor = connection.cursor()
    # null values become ''
    cursor.copy_from(output, table_name, null='')
    connection.commit()
    cursor.close()

engine = create_engine('postgresql+psycopg2://username:password@localhost:5432/databasename')
df2db(df, table_name, engine)

Fetching language detection from Google api

I have a CSV with keywords in one column and the number of impressions in a second column.
I'd like to provide the keywords in a URL (while looping) and have the Google language API return what language each keyword was in.
I have it working manually. If I enter (with the correct API key):
http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=merde
I get:
{"responseData": {"language":"fr","isReliable":false,"confidence":6.213709E-4}, "responseDetails": null, "responseStatus": 200}
which is correct; 'merde' is French.
So far I have this code, but I keep getting "server unreachable" errors:
import time
import csv
from operator import itemgetter
import sys
import fileinput
import urllib2
import json

E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2

#not working
def parse_result(result):
    """Parse a JSONP result string and return a list of terms"""
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row
    return row in result_object

#not working
def retrieve_terms(seedterm):
    print(seedterm)
    """Retrieves and parses data and returns a list of terms"""
    url_template = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=%(seed)s'
    url = url_template % {"seed": seedterm}
    try:
        with urllib2.urlopen(url) as data:
            data = perform_request(seedterm)
            result = data.read()
    except:
        sys.stderr.write('%s\n' % 'Could not request data from server')
        exit(E_OPERATION_ERROR)

    #terms = parse_result(result)
    #print terms
    print result

def main(argv):
    filename = argv[1]
    csvfile = open(filename, 'r')
    csvreader = csv.DictReader(csvfile)
    rows = []
    for row in csvreader:
        rows.append(row)
    sortedrows = sorted(rows, key=itemgetter('impressions'), reverse=True)
    keys = sortedrows[0].keys()
    for item in sortedrows:
        retrieve_terms(item['keywords'])
    try:
        outputfile = open('Output_%s.csv' % (filename), 'w')
    except IOError:
        print("The file is active in another program - close it first!")
        sys.exit()
    dict_writer = csv.DictWriter(outputfile, keys, lineterminator='\n')
    dict_writer.writer.writerow(keys)
    dict_writer.writerows(sortedrows)
    outputfile.close()
    print("File is Done!! Check your folder")

if __name__ == '__main__':
    start_time = time.clock()
    main(sys.argv)
    print("\n")
    print time.clock() - start_time, "seconds for script time"
Any idea how to finish the code so that it will work? Thank you!
Try adding referrer and userip as described in the docs:
An area to pay special attention to relates to correctly identifying yourself in your requests. Applications MUST always include a valid and accurate http referer header in their requests. In addition, we ask, but do not require, that each request contains a valid API Key. By providing a key, your application provides us with a secondary identification mechanism that is useful should we need to contact you in order to correct any problems. Read more about the usefulness of having an API key.
Developers are also encouraged to make use of the userip parameter (see below) to supply the IP address of the end-user on whose behalf you are making the API request. Doing so will help distinguish this legitimate server-side traffic from traffic which doesn't come from an end-user.
Here's an example based on the answer to the question "access to google with python":
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib, urllib2
from pprint import pprint

api_key, userip = None, None
query = {'q': 'матрёшка'}
referrer = "https://stackoverflow.com/q/4309599/4279"

if userip:
    query.update(userip=userip)
if api_key:
    query.update(key=api_key)

url = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&%s' % (
    urllib.urlencode(query))
request = urllib2.Request(url, headers=dict(Referer=referrer))
json_data = json.load(urllib2.urlopen(request))
pprint(json_data['responseData'])
Output
{u'confidence': 0.070496580000000003, u'isReliable': False, u'language': u'ru'}
Another issue might be that seedterm is not properly quoted:
if isinstance(seedterm, unicode):
    value = seedterm
else:  # bytes
    value = seedterm.decode(put_encoding_here)

url = 'http://...q=%s' % urllib.quote_plus(value.encode('utf-8'))
