Column names containing "." to be replaced with "_" using Python

The code below reads a JSON file and stores it in a DB table.
However, a few column names are generated with "." in them.
Only those column names should have the "." replaced with an underscore "_" before pushing to the table; the data itself shouldn't change, only the column names containing ".".
Below is what I tried; I'm not sure how to rename the columns and then push to the DB:
import pandas as pd
import json
import sys
import psycopg2
data = sys.argv[1]
user= sys.argv[2]
password = sys.argv[3]
host = sys.argv[4]
port = sys.argv[5]
db = sys.argv[6]
documenttype = sys.argv[7]
schema_name = sys.argv[8]
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://'+user+':'+password+'@'+host+':'+port+'/'+db)
print("Database is connected")
df = pd.read_json(data)
df['RecordsNew'] = df['Records'].astype('|S80')
df_1 = pd.json_normalize(df['Records'])
df_1.columns = map(str.lower, df_1.columns)
table_name =documenttype.lower()
df_1.to_sql(table_name,schema=schema_name,con=engine, if_exists = 'append',index=False)
JSON code example:
{
"Records": [
{
"CommodityId": "3470",
"SourceSystem": "SSP-generic-CHILD4",
"CommodityName": "ANCHOR BOLTS - BILL OF MATERIALS",
"CommodityType": ""
},
{
"CommodityId": "468657",
"SourceSystem": "SSP-generic-CHILD4",
"CommodityName": "COOLING INSERT",
"CommodityType": ""
},
{
"CommodityId": "836519",
"SourceSystem": "SSP-generic-CHILD4",
"CommodityName": "DIAPHRAGM 2ND STAGE PGT25",
"CommodityType": ""
},
{
"CommodityId": "807525",
"SourceSystem": "SSP-generic-CHILD4",
"CommodityName": "MOBILE NOZZLE MACHINING 2ST MS5002C(S2N)",
"CommodityType": ""
"ReconciledBy":{"SourceSystem":"SSP-sap-CHILD1","UserId":"","PasswordAdapter":""}
},
...
],
"PageToken": "TlhXQ0FVcTlzNE8rQQ"
}
Example of column names generated with "."
"reconciledby.passwordadapter"
"procurementunit.sourcesystem"
"procurementunit.uniquename"
"sourcesystem.sourcesystemid"

Related

Trying to insert from JSON file to database

I have to use the Yelp API in a Django web app. I created db() to enter the data into the database, but how do I address the error below? I'm trying to do it without pandas:
Message=string indices must be integers
Source=C:\Users\diggt\OneDrive\College\Rowan\Fall22\10430_computing_and_informatics_capstone\yelp_VSCode\yelp.py
StackTrace:
File "C:\Users\diggt\OneDrive\College\Rowan\Fall22\10430_computing_and_informatics_capstone\yelp_VSCode\yelp.py", line 104, in <genexpr>
keys = (entry[c] for c in columns)
File "C:\Users\diggt\OneDrive\College\Rowan\Fall22\10430_computing_and_informatics_capstone\yelp_VSCode\yelp.py", line 115, in db
cur.executemany(sql, keys)
File "C:\Users\diggt\OneDrive\College\Rowan\Fall22\10430_computing_and_informatics_capstone\yelp_VSCode\yelp.py", line 153, in main
db()
File "C:\Users\diggt\OneDrive\College\Rowan\Fall22\10430_computing_and_informatics_capstone\yelp_VSCode\yelp.py", line 157, in <module> (Current frame)
main()
# -*- coding: utf-8 -*-
from __future__ import print_function
import argparse
import json
import csv
import pprint
import requests
import sys
import sqlite3
#import pandas as pd
from urllib.error import HTTPError
from urllib.parse import quote
API_KEY = 'secret'
# API constants, you shouldn't have to change these.
API_HOST = 'https://api.yelp.com'
SEARCH_PATH = '/v3/businesses/search'
BUSINESS_PATH = '/v3/businesses/' # Business ID will come after slash.
# Defaults
DEFAULT_TERM = 'dinner'
DEFAULT_LOCATION = 'Glassboro, NJ'
SEARCH_LIMIT = 3
OFFSET = 0
def request(host, path, api_key, url_params=None):
    url_params = url_params or {}
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }
    print(u'Querying {0} ...'.format(url))
    response = requests.request('GET', url, headers=headers, params=url_params)
    return response.json()

def search(api_key, term, location):
    url_params = {
        'term': term.replace(' ', '+'),
        'location': location.replace(' ', '+'),
        'limit': SEARCH_LIMIT,
        'offset': OFFSET
    }
    return request(API_HOST, SEARCH_PATH, api_key, url_params=url_params)

def get_business(api_key, business_id):
    business_path = BUSINESS_PATH + business_id
    return request(API_HOST, business_path, api_key)

def query_api(term, location):
    response = search(API_KEY, term, location)
    businesses = response.get('businesses')
    if not businesses:
        print(u'No businesses for {0} in {1} found.'.format(term, location))
        return
    business_id = businesses[0]['id']
    print(u'{0} businesses found, querying business info ' \
        'for the top result "{1}" ...'.format(
            len(businesses), business_id))
    response = get_business(API_KEY, business_id)
    print(u'Result for business "{0}" found:'.format(business_id))
    pprint.pprint(response, indent=2)
    str_to_write_to_file = json.dumps(response, skipkeys=True, allow_nan=True, indent=4)
    with open('yelp.json', 'w') as f:
        f.write(str_to_write_to_file)
def db():
    with open('yelp.json', 'r') as f:
        data = f.readlines()
    conn = sqlite3.connect('yelp.db')
    cur = conn.cursor()
    # Create the table if it doesn't exist.
    cur.execute(
        """CREATE TABLE IF NOT EXISTS yelp(
            id INTEGER PRIMARY KEY,
            alias varchar(100),
            location varchar(100),
            display_phone varchar(15)
        );"""
    )
    for entry in data:
        columns = ["id" "alias", "location", "display_phone"]
        keys = (entry[c] for c in columns)
        # Execute the command and replace '?' with the each value
        # in 'values'. DO NOT build a string and replace manually.
        # the sqlite3 library will handle non safe strings by doing this.
        sql = """INSERT INTO yelp (id, alias, location, display_phone) VALUES(
            ?,
            ?,
            ?,
            ?
        );"""
        cur.executemany(sql, keys)
        print(f'{entry["alias"]} data inserted Succefully')
    conn.commit()
    conn.close()
    with sqlite3.connect("yelp.db") as conn:
        cmd = """SELECT * FROM yelp;"""
        cur = conn.execute(cmd)
        res = cur.fetchall()
        for r in res:
            print(r)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--term', dest='term', default=DEFAULT_TERM,
                        type=str, help='Search term (default: %(default)s)')
    parser.add_argument('-l', '--location', dest='location',
                        default=DEFAULT_LOCATION, type=str,
                        help='Search location (default: %(default)s)')
    input_values = parser.parse_args()
    try:
        query_api(input_values.term, input_values.location)
    except HTTPError as error:
        sys.exit(
            'Encountered HTTP error {0} on {1}:\n {2}\nAbort program.'.format(
                error.code,
                error.url,
                error.read(),
            )
        )
    db()

if __name__ == '__main__':
    main()
JSON file:
{
"id": "umC69pkiPyk3qY7IB49ZYw",
"alias": "bosphorus-mediterranean-cuisine-glassboro",
"name": "Bosphorus Mediterranean Cuisine",
"image_url": "https://s3-media4.fl.yelpcdn.com/bphoto/G7VCO3tvx8NGPz5g0fSpMw/o.jpg",
"is_claimed": true,
"is_closed": false,
"url": "https://www.yelp.com/biz/bosphorus-mediterranean-cuisine-glassboro?adjust_creative=9aYQmmK21ApZ7TfokeTk1A&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_lookup&utm_source=9aYQmmK21ApZ7TfokeTk1A",
"phone": "+18562432015",
"display_phone": "(856) 243-2015",
"review_count": 14,
"categories": [
{
"alias": "turkish",
"title": "Turkish"
},
{
"alias": "halal",
"title": "Halal"
},
{
"alias": "kebab",
"title": "Kebab"
}
],
"rating": 5.0,
"location": {
"address1": "524 Delsea Drive N",
"address2": null,
"address3": null,
"city": "Glassboro",
"zip_code": "08028",
"country": "US",
"state": "NJ",
"display_address": [
"524 Delsea Drive N",
"Glassboro, NJ 08028"
],
"cross_streets": ""
},
"coordinates": {
"latitude": 39.7150351328115,
"longitude": -75.1118882
},
"photos": [
"https://s3-media4.fl.yelpcdn.com/bphoto/G7VCO3tvx8NGPz5g0fSpMw/o.jpg",
"https://s3-media2.fl.yelpcdn.com/bphoto/HvhYRZO2rOYUBX0DagVE3w/o.jpg",
"https://s3-media2.fl.yelpcdn.com/bphoto/PQHr3upfVULUjwz1M-ILcw/o.jpg"
],
"hours": [
{
"open": [
{
"is_overnight": false,
"start": "1100",
"end": "2200",
"day": 0
},
{
"is_overnight": false,
"start": "1100",
"end": "2200",
"day": 1
},
{
"is_overnight": false,
"start": "1100",
"end": "2200",
"day": 2
},
{
"is_overnight": false,
"start": "1100",
"end": "2200",
"day": 3
},
{
"is_overnight": false,
"start": "1100",
"end": "2200",
"day": 4
},
{
"is_overnight": false,
"start": "1100",
"end": "2200",
"day": 5
},
{
"is_overnight": false,
"start": "1100",
"end": "2200",
"day": 6
}
],
"hours_type": "REGULAR",
"is_open_now": true
}
],
"transactions": [
"pickup",
"delivery"
],
"messaging": {
"url": "https://www.yelp.com/raq/umC69pkiPyk3qY7IB49ZYw?adjust_creative=9aYQmmK21ApZ7TfokeTk1A&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_lookup&utm_source=9aYQmmK21ApZ7TfokeTk1A#popup%3Araq",
"use_case_text": "Message the Business"
}
}
You shouldn't use f.readlines() to read a JSON file, use json.load(f).
There's only one set of values in the JSON, so you don't need a loop or executemany().
def db():
    with open('yelp.json', 'r') as f:
        data = json.load(f)
    conn = sqlite3.connect('yelp.db')
    cur = conn.cursor()
    # Create the table if it doesn't exist.
    cur.execute(
        """CREATE TABLE IF NOT EXISTS yelp(
            id INTEGER PRIMARY KEY,
            alias varchar(100),
            location varchar(100),
            display_phone varchar(15)
        );"""
    )
    columns = ["id", "alias", "location", "display_phone"]
    keys = [data[c] for c in columns]
    # Execute the command and replace '?' with each value in 'keys'.
    # DO NOT build the string and substitute values manually; the
    # sqlite3 library handles unsafe strings when you use placeholders.
    sql = """INSERT INTO yelp (id, alias, location, display_phone) VALUES(
        ?,
        ?,
        ?,
        ?
    );"""
    cur.execute(sql, keys)
    print(f'{data["alias"]} data inserted successfully')
    conn.commit()
    conn.close()
    with sqlite3.connect("yelp.db") as conn:
        cmd = """SELECT * FROM yelp;"""
        cur = conn.execute(cmd)
        res = cur.fetchall()
        for r in res:
            print(r)
So ultimately I figured it out, pretty much. I used what @Bramar said, but the solution was making the JSON file an array. Then I started getting this error: sqlite3.ProgrammingError: Incorrect number of bindings supplied. The current statement uses 4, and there are 1 supplied. That turned out to be because one of my entries was stored in the JSON as a dict, so I eliminated it temporarily to see if I could make it work, and it works. This is the code:
print(u'Result for business "{0}" found:'.format(business_id))
str_to_write_to_file = json.dumps([response], indent=4)
with open('yelp.json', 'w') as f:
    f.write(str_to_write_to_file)

def db():
    with open('yelp.json', 'r') as f:
        data = json.load(f)
    conn = sqlite3.connect('data/yelp.db')
    cur = conn.cursor()
    # Create the table if it doesn't exist.
    cur.execute(
        """CREATE TABLE IF NOT EXISTS yelp(
            id INTEGER PRIMARY KEY,
            alias varchar(100),
            display_phone varchar(15),
            location dictionary
        );"""
    )
    columns = ["alias", "display_phone"]
    keys = [data[0][c] for c in columns]
    # Execute the command and replace '?' with each value in 'keys'.
    # DO NOT build the string and substitute values manually; the
    # sqlite3 library handles unsafe strings when you use placeholders.
    sql = '''INSERT INTO yelp (alias, display_phone) VALUES(
        ?,
        ?
    );'''
    cur.execute(sql, keys)
    conn.commit()
    conn.close()
Hopefully this helps someone; this can be very confusing.
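If you do want to keep the nested location object instead of dropping it, one option (a sketch, not part of the original answer) is to serialize it to a JSON string before binding it, since sqlite3 can only bind simple types such as numbers, strings, bytes, and None:

import json
import sqlite3

# Sketch: store the nested "location" dict as a JSON string so sqlite3 can bind it.
# Assumes yelp.json holds a one-element list and the yelp table exists, as in the code above.
with open('yelp.json', 'r') as f:
    record = json.load(f)[0]

conn = sqlite3.connect('data/yelp.db')
cur = conn.cursor()
cur.execute(
    "INSERT INTO yelp (alias, display_phone, location) VALUES (?, ?, ?);",
    (record["alias"], record["display_phone"], json.dumps(record["location"])),
)
conn.commit()
conn.close()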

Django - how can i insert '.json' file to SQLite DB?

My .json file looks like:
{
"users": [
{
"userId": 1,
"firstName": "AAAAA",
"lastName": "as23",
"phoneNumber": "123456",
"emailAddress": "AAAAA#test.com",
"homepage": "https://amogg.tistory.com/1"
},
{
"userId": 2,
"firstName": "BBBB",
"lastName": "h5jdd",
"phoneNumber": "123456",
"homepage": "https://amogg.tistory.com/2"
},
{
"userId": 3,
...
I searched Google and tried to solve this problem, but it's still unresolved, so I'm using pandas and sqlite3:
import sqlite3 as db
import pandas as pd
df = pd.read_json('test.json')
con = db.connect('./test.db')
df.to_sql('test', con=con)
The DB is created, but the .json file's data doesn't get saved into it.
How can I solve this problem?
You will have to create the table 'test' beforehand, iterate over the pandas dataframe df and insert the records into the table one by one:
import sqlite3 as db
import pandas as pd

df = pd.read_json('test.json', orient='index')
con = db.connect('./test.db')
cursor = con.cursor()
cursor.execute('''create table test (userId int primary key,
                                     firstName text,
                                     lastName text,
                                     phoneNumber text,
                                     emailAddress text,
                                     homePage text)''')
for index, row in df.iterrows():
    for element in row.iteritems():
        try:
            firstName = element[1]['firstName']
        except:
            firstName = ''
        try:
            lastName = element[1]['lastName']
        except:
            lastName = ''
        try:
            phoneNumber = element[1]['phoneNumber']
        except:
            phoneNumber = ''
        try:
            emailAddress = element[1]['emailAddress']
        except:
            emailAddress = ''
        try:
            homepage = element[1]['homepage']
        except:
            homepage = ''
        cursor.execute("INSERT INTO test VALUES (?,?,?,?,?,?)", (element[1]['userId'],
                                                                 firstName,
                                                                 lastName,
                                                                 phoneNumber,
                                                                 emailAddress,
                                                                 homepage))
con.commit()
con.close()
Since not all the records have the same valid values for all the columns, you need to validate the existence of each column with a try/except and store an empty string if the column does not exist in the row.
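A more compact variant of the same idea (just a sketch, not part of the original answer) uses dict.get() with a default value instead of one try/except per field:

# Sketch: same insert as above, but missing keys simply default to ''.
# Assumes df, con and cursor are set up as in the answer's code.
fields = ['userId', 'firstName', 'lastName', 'phoneNumber', 'emailAddress', 'homepage']
for index, row in df.iterrows():
    for key, user in row.items():  # each user is a dict parsed from the JSON
        cursor.execute(
            "INSERT INTO test VALUES (?,?,?,?,?,?)",
            tuple(user.get(f, '') for f in fields),
        )
con.commit()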

Loading irregular json into Elasticsearch index with mapping using Python client

I have some .json where not all fields are present in all records; for example, caseclass.json looks like:
[{
"name" : "john smith",
"age" : 12,
"cars": ["ford", "toyota"],
"comment": "i am happy"
},
{
"name": "a. n. other",
"cars": "",
"comment": "i am panicking"
}]
I'm using Elasticsearch 7.6.1 via the Python client elasticsearch:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import json
import os
from elasticsearch_dsl import Document, Text, Date, Integer, analyzer

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

class Person(Document):
    class Index:
        using = es
        name = 'person_index'
    name = Text()
    age = Integer()
    cars = Text()
    comment = Text(analyzer='snowball')

Person.init()

with open("caseclass.json") as json_file:
    data = json.load(json_file)
    for indexid in range(len(data)):
        document = Person(name=data[indexid]['name'], age=data[indexid]['age'], cars=data[indexid]['cars'], comment=data[indexid]['comment'])
        document.meta.id = indexid
        document.save()
Naturally I get KeyError: 'age' when the second record is read. My question is: is it possible to load such records into an Elasticsearch index using the Python client and a pre-defined mapping, instead of dynamic mapping? The code above works if all fields are present in all records, but is there a way to do this without checking the presence of each field per record? The actual records have a complex structure and there are millions of them. Thanks
The error has nothing to do w/ your mapping -- it's just telling you that age could not be accessed in one of your caseclasses.
The index mapping is created when you call Person.init() -- you can verify that by calling print(es.indices.get_mapping(Person.Index.name)) right after Person.init().
I've cleaned up your code a bit:
import json
import os
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Document, Text, Date, Integer, analyzer

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

class Person(Document):
    class Index:
        using = es
        name = 'person_index'
    name = Text()
    age = Integer()
    cars = Text()
    comment = Text(analyzer='snowball')

Person.init()
print(es.indices.get_mapping(Person.Index.name))

with open("caseclass.json") as json_file:
    data = json.load(json_file)
    for indexid, case in enumerate(data):
        document = Person(**case)
        document.meta.id = indexid
        document.save()
Notice how I used **case to spread all key-value pairs inside of a case instead of using data[property_key].
The generated mapping is as follows:
{
"person_index" : {
"mappings" : {
"properties" : {
"age" : {
"type" : "integer"
},
"cars" : {
"type" : "text"
},
"comment" : {
"type" : "text",
"analyzer" : "snowball"
},
"name" : {
"type" : "text"
}
}
}
}
}
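Since the question mentions millions of records, a possible follow-up (a sketch, assuming the Person class and es client defined above) is to index with the bulk helper instead of one save() call per document:

# Sketch: bulk-index the documents instead of calling save() once per record.
from elasticsearch.helpers import bulk

with open("caseclass.json") as json_file:
    data = json.load(json_file)

def generate_actions(records):
    for indexid, case in enumerate(records):
        doc = Person(**case)
        doc.meta.id = indexid
        # include_meta=True adds _index and _id so the bulk helper knows where to send it
        yield doc.to_dict(include_meta=True)

bulk(es, generate_actions(data))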

How to create a filename with the current date and time in python when query is ran

When I run my query below, it creates a file called 'mycsvfile'. However, is there a way to add the current date and timestamp when the CSV file is created? For example, if I run this query now, the file should be named something like mycsvfile20171012-10:00:00.
Could someone edit my code and show me how to do this please?
My code:
from elasticsearch import Elasticsearch
import csv
es = Elasticsearch(["9200"])
# Replace the following Query with your own Elastic Search Query
res = es.search(index="search", body=
{
"_source": ["DTDT", "TRDT", "SPLE", "RPLE"],
"query": {
"bool": {
"should": [
{"wildcard": {"CN": "TEST1"}}
]
}
}
}, size=10)
header_names = { 'DTDT': 'DATE', 'TRDT': 'TIME', ...}
with open('mycsvfile.csv', 'w') as f:  # Just use 'w' mode in 3.x
    header_present = False
    for doc in res['hits']['hits']:
        my_dict = doc['_source']
        if not header_present:
            w = csv.DictWriter(f, my_dict.keys())
            w.writerow(header_names)  # will write DATE, TIME, ... in correct place
            header_present = True
        w.writerow(my_dict)
Thank you in advance!
It is better to use an underscore in the filename than any other special character, since it is widely accepted. Therefore, construct the file name as below:
csv_file = 'myfile_' + str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
Use datetime as below:
from elasticsearch import Elasticsearch
import csv
es = Elasticsearch(["9200"])
# Replace the following Query with your own Elastic Search Query
res = es.search(index="search", body=
{
"_source": ["DTDT", "TRDT", "SPLE", "RPLE"],
"query": {
"bool": {
"should": [
{"wildcard": {"CN": "TEST1"}}
]
}
}
}, size=10)
from datetime import datetime
import os

file_path = <PASS YOUR FILE HERE>
csv_file = 'myfile_' + str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
csv_file_full = os.path.join(file_path, csv_file)

header_names = { 'DTDT': 'DATE', 'TRDT': 'TIME', ...}

with open(csv_file_full, 'w') as f:  # Just use 'w' mode in 3.x
    header_present = False
    for doc in res['hits']['hits']:
        my_dict = doc['_source']
        if not header_present:
            w = csv.DictWriter(f, my_dict.keys())
            w.writerow(header_names)  # will write DATE, TIME, ... in correct place
            header_present = True
        w.writerow(my_dict)
Yes, you can do it like this. Note that ":" is not supported in filenames, so use something like 20171010-10.00.00 instead:
>>> import time
>>> fname = lambda : "mycsvfile{}.csv".format(time.strftime("%Y%m%d-%H.%M.%S"))
>>>
>>> fname()
'mycsvfile20171012-17.24.59.csv'
>>> with open(fname(), "w") as f:
...     pass
Have a variable for the file name, file_name, and use datetime.now():
from datetime import datetime
file_name = 'mycsvfile' + str(datetime.now()) + '.csv'
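Note that str(datetime.now()) produces something like '2017-10-12 10:00:00.123456', which contains spaces and ':' characters that are not allowed in Windows filenames; a filename-safe variant (the exact format here is just an assumption) is:

from datetime import datetime

# Sketch: a timestamp built only from filename-safe characters.
file_name = 'mycsvfile' + datetime.now().strftime('%Y%m%d-%H%M%S') + '.csv'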

How to write JSON data to Dynamodb by ignoring empty elements in boto3

I would like to write the following group of data to DynamoDB. There are about 100 records. Since images are not necessarily required, some records have the image_url element and some don't.
(questionsList.json)
{
"q_id" : "001",
"q_body" : "Where is the capital of the United States?",
"q_answer" : "Washington, D.C.",
"image_url" : "/Washington.jpg",
"keywords" : [
"UnitedStates",
"Washington"
]
},
{
"q_id" : "002",
"q_body" : "Where is the capital city of the UK?",
"q_answer" : "London",
"image_url" : "",
"keywords" : [
"UK",
"London"
]
},
Since this is the write-testing phase, the DynamoDB to write to is set up on localhost:8000 using the serverless-dynamodb-local plugin of the Serverless Framework, not the production environment.
To write the above JSON data to this DynamoDB, I wrote the following code with Boto3 (the AWS SDK for Python).
from __future__ import print_function
import boto3
import codecs
import json

dynamodb = boto3.resource('dynamodb', region_name='us-east-1', endpoint_url="http://localhost:8000")
table = dynamodb.Table('questionListTable')

with open("questionList.json", "r", encoding='utf-8') as json_file:
    items = json.load(json_file)
    for item in items:
        q_id = item['q_id']
        q_body = item['q_body']
        q_answer = item['q_answer']
        image_url = item['image_url']
        keywords = item['keywords']
        print("Adding detail:", q_id, q_body)
        table.put_item(
            Item={
                'q_id': q_id,
                'q_body': q_body,
                'q_answer': q_answer,
                'image_url': image_url,
                'keywords': keywords,
            }
        )
When this code is executed, the following error occurs for the empty-string values.
botocore.exceptions.ClientError: An error occurred (ValidationException) when calling the PutItem operation: One or more parameter values were invalid: An AttributeValue may not contain an empty string
Apparently it is caused by the empty strings in the JSON.
If the image_url containing the empty string is excluded from the write, as below, the write completes without any problem.
from __future__ import print_function
import boto3
import codecs
import json

dynamodb = boto3.resource('dynamodb', region_name='us-east-1', endpoint_url="http://localhost:8000")
table = dynamodb.Table('questionListTable')

with open("questionList.json", "r", encoding='utf-8') as json_file:
    items = json.load(json_file)
    for item in items:
        q_id = item['q_id']
        q_body = item['q_body']
        q_answer = item['q_answer']
        #image_url = item['image_url']
        keywords = item['keywords']
        print("Adding detail:", q_id, q_body)
        table.put_item(
            Item={
                'q_id': q_id,
                'q_body': q_body,
                'q_answer': q_answer,
                #'image_url': image_url,
                'keywords': keywords,
            }
        )
Since DynamoDB is NoSQL, there may be other methods that make better use of its characteristics, but how can I correct the code so that it writes the above data while ignoring empty strings? In other words: if image_url exists, write it; if it does not, ignore it.
Thank you.
I solved my problem. You can set null as follows.
from __future__ import print_function
import boto3
import codecs
import json

dynamodb = boto3.resource('dynamodb', region_name='ap-northeast-1', endpoint_url="http://localhost:8000")
table = dynamodb.Table('questionListTable')

with open("questionList.json", "r", encoding='utf-8_sig') as json_file:
    items = json.load(json_file)
    for item in items:
        q_id = item['q_id']
        q_body = item['q_body']
        q_answer = item['q_answer']
        image_url = item['image_url'] if item['image_url'] else None
        keywords = item['keywords'] if item['keywords'] else None
        print("Adding detail:", q_id, q_body)
        table.put_item(
            Item={
                'q_id': q_id,
                'q_body': q_body,
                'q_answer': q_answer,
                'image_url': image_url,
                'keywords': keywords,
            }
        )
To check the state of DynamoDB, I ran API Gateway in the local environment using the offline plugin of the Serverless Framework. When I actually called the API using Postman, null was properly inserted as the value.
{
"q_id" : "001",
"q_body" : "Where is the capital of the United States?",
"q_answer" : "Washington, D.C.",
"image_url" : "/Washington.jpg",
"keywords" : [
"UnitedStates",
"Washington"
]
},
{
"q_id" : "002",
"q_body" : "Where is the capital city of the UK?",
"q_answer" : "London",
"image_url" : "null",
"keywords" : [
"UK",
"London"
]
},
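If the goal is to omit empty attributes entirely rather than store null ("if image_url exists, write it; if it does not, ignore it"), one alternative (a sketch, not part of the accepted answer, assuming the same table and items as above) is to filter each item before put_item:

# Sketch: drop attributes whose value is an empty string or None before writing,
# so records without an image simply have no image_url attribute at all.
for item in items:
    cleaned = {k: v for k, v in item.items() if v not in ("", None)}
    print("Adding detail:", cleaned.get('q_id'), cleaned.get('q_body'))
    table.put_item(Item=cleaned)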
