Is there a way to insert json into postgres database using psycopg2? - python

I'm trying to insert the following data into a postgres database
{
"id": 131739425477632000,
"user_name": "KithureKindiki",
"content": "#Fchurii You're right, Francis.",
"deleted": 1,
"created": "2011-11-02 14:28:21",
"modified": "2019-01-10 13:05:42",
"tweet": "{\"contributors\": null, \"truncated\": false, \"text\": \"#Fchurii You're right, Francis.\", \"is_quote_status\": false, \"in_reply_to_status_id\": 131738250736971778, \"id\": 131739425477632000, \"favorite_count\": 0, \"source\": \"Twitter Web Client\", \"retweeted\": false, \"coordinates\": null, \"entities\": {\"symbols\": [], \"user_mentions\": [{\"indices\": [0, 8], \"id_str\": \"284946979\", \"screen_name\": \"Fchurii\", \"name\": \"Francis Gachuri\", \"id\": 284946979}], \"hashtags\": [], \"urls\": []}, \"in_reply_to_screen_name\": \"Fchurii\", \"in_reply_to_user_id\": 284946979, \"retweet_count\": 0, \"id_str\": \"131739425477632000\", \"favorited\": false, \"user\": {\"follow_request_sent\": false, \"has_extended_profile\": false, \"profile_use_background_image\": true, \"contributors_enabled\": false, \"id\": 399935104, \"verified\": false, \"translator_type\": \"none\", \"profile_text_color\": \"333333\", \"profile_image_url_https\": \"https://pbs.twimg.com/profile_images/538310980468764672/xpJnlD_-_normal.jpeg\", \"profile_sidebar_fill_color\": \"DDEEF6\", \"entities\": {\"description\": {\"urls\": []}}, \"followers_count\": 23555, \"profile_sidebar_border_color\": \"C0DEED\", \"id_str\": \"399935104\", \"default_profile_image\": false, \"listed_count\": 17, \"is_translation_enabled\": false, \"utc_offset\": null, \"statuses_count\": 246, \"description\": \"Majority Leader, The Senate of the Republic of Kenya\", \"friends_count\": 244, \"location\": \"\", \"profile_link_color\": \"1DA1F2\", \"profile_image_url\": \"http://pbs.twimg.com/profile_images/538310980468764672/xpJnlD_-_normal.jpeg\", \"notifications\": false, \"geo_enabled\": false, \"profile_background_color\": \"C0DEED\", \"profile_background_image_url\": \"http://abs.twimg.com/images/themes/theme1/bg.png\", \"screen_name\": \"KithureKindiki\", \"lang\": \"en\", \"following\": false, \"profile_background_tile\": false, \"favourites_count\": 11, \"name\": \"Kithure Kindiki\", 
\"url\": null, \"created_at\": \"Fri Oct 28 08:09:57 +0000 2011\", \"profile_background_image_url_https\": \"https://abs.twimg.com/images/themes/theme1/bg.png\", \"time_zone\": null, \"protected\": false, \"default_profile\": true, \"is_translator\": false}, \"geo\": null, \"in_reply_to_user_id_str\": \"284946979\", \"lang\": \"en\", \"created_at\": \"Wed Nov 02 14:28:21 +0000 2011\", \"in_reply_to_status_id_str\": \"131738250736971778\", \"place\": null}",
"politician_id": 41,
"approved": 1,
"reviewed": 1,
"reviewed_at": "2019-01-10 13:05:42",
"review_message": null,
"retweeted_id": null,
"retweeted_content": null,
"retweeted_user_name": null
}
using the following code
qwery = f"INSERT INTO deleted_tweets(id,user_name,content,deleted,created,modified,tweet,politician_id,approved,reviewed,reviewed_at,review_message,retweeted_id,retweeted_content,retweeted_user_name) VALUES {row['id'], row['user_name'], row['content'], bool(row['deleted']), row['created'], row['modified'],row['tweet'],row['politician_id'],bool(row['approved']), bool(row['reviewed']),row['reviewed_at'],row['review_message'],row['retweeted_id'],row['retweeted_content'],row['retweeted_user_name']}"
qwery = qwery.replace('None', 'null')
cursor.execute(qwery)
However, I get the following error
*** psycopg2.errors.SyntaxError: syntax error at or near "re"
LINE 1: ... null, "truncated": false, "text": "#Fchurii You\'re right, ...
I know this is due to the single quote but I'm not sure how to overcome it. I've tried adding backslash to the string something like \"text\": \"#Fchurii You\\'re right, Francis.\",
but still getting the same error. Any ideas on how to bypass this?

Try:
# Insert one tweet row using a parameterized query. The values are passed
# separately from the SQL text, so psycopg2 does the quoting/escaping itself
# and the apostrophe inside the JSON text no longer breaks the statement.
data = [
    row['id'],
    row['user_name'],
    row['content'],
    bool(row['deleted']),
    row['created'],
    row['modified'],
    row['tweet'],
    row['politician_id'],
    bool(row['approved']),
    bool(row['reviewed']),
    row['reviewed_at'],
    row['review_message'],
    row['retweeted_id'],
    row['retweeted_content'],
    row['retweeted_user_name'],
]
# One %s placeholder per value; deriving the count from `data` keeps the
# placeholders and the values in step.
placeholders = ", ".join(["%s"] * len(data))
query = (
    "INSERT INTO deleted_tweets (id,user_name,content,deleted,created,modified,"
    "tweet,politician_id,approved,reviewed,reviewed_at,review_message,"
    "retweeted_id,retweeted_content,retweeted_user_name) "
    "VALUES (" + placeholders + ")"
)
# None is adapted to SQL NULL by psycopg2, so the values are passed as-is;
# substituting the string 'null' would store the literal text "null".
cursor.execute(query, data)

Related

sanitize unicode from json

how do I properly remove unicode so I can load the json
data = json.loads(json_string)
json.decoder.JSONDecodeError: Invalid \escape: line 1 column 72 (char 71)
{"user": {"user_id": 455830511, "username": "dualipa_384", "name": "Dua\xa0Lipa", "private": false, "verified_user": false, "avatar_url": "https://uploads.cdn.triller.co/v1/avatars/455830511/1619366527_avatar.jpg", "profile_cover_url": "None", "dm_registered": true, "storefront_url": "None", "creator_status": false, "contributor_status": false, "user_uuid": "bce20042-a143-4caf-adbc-6b39bbb2d30a", "about_me": "Go stream my new album Future Nostalgia The Moonlight Edition❤️\ndualipa.co/weregood-video", "auto_confirmed": true, "instagram_handle": "#dualipa", "instagram_verified": false, "soundcloud_url": "None", "button_text": "None", "button_text_color": "None", "button_background_color": "None", "button_url": "None", "follower_count": 0, "followed_count": 55, "verified": true, "failed_age_validation": false, "has_snaps": false, "profile_type": "public", "blocking_user": false, "blocked_by_user": false, "followed_by_me": "false", "follower_of_me": "false", "subscription": {"is_subscribed": false}}, "status": true}
I have tried to do the following but it did not work
json_string = json_string.replace(u'\xa0', u'')
json_string = unicodedata.normalize("NFKD", json_string)
There is a newline character within a string. JSON does not allow line breaks within strings. Replace the line break with an escape sequence:
json.loads(json_string.replace('\n', r'\n'))
This is how it worked for me:
# Clean up a raw JSON text held in json_string and parse it into `data`.
import json
import unicodedata
# Assuming json_string is a str, dumps() followed by loads() returns an equal
# string, so this line does not change the text.
json_string = json.loads(json.dumps(json_string))
# NOTE(review): this rewrites the payload, not just its encoding:
#   - "false" (quoted) becomes "False", then every remaining bare false becomes
#     the string "False" and every true becomes the string "True", so JSON
#     booleans are parsed as strings; the substring match also hits any
#     free text that happens to contain "true"/"false".
#   - every newline character is replaced by a space.
json_string = json_string.replace("\"false\"", "\"False\"").replace("false", "\"False\"").replace("true", "\"True\"").replace("\n", " ")
# NFKD applies compatibility decomposition (e.g. a no-break space U+00A0
# becomes a plain space).
json_string = unicodedata.normalize("NFKD", json_string)
# Drops any remaining U+00A0 characters; presumably none are left after the
# NFKD step above - TODO confirm.
json_string = json_string.replace(u'\xa0', u'')
# Newlines were already turned into spaces in the replace chain above, so this
# escape step has nothing left to match.
json_string = json_string.replace('\n', r'\n')
data = json.loads(json_string)
print(data)

Access nested dict key in following style: selected = {k: tweets[i]._json[k] for k in {'created_at', 'id', 'full_text'}

After hours of research I am kind of lost. No existing question seems to match mine.
The problem is the following:
I have a JSON object containing all sorts of information about a tweet. Much of it is nested, meaning that the value for a key is itself a JSON object. The keys inside the { } are the keys whose key-value pairs I want to retrieve. With 'first level' keys there is no problem whatsoever; it retrieves them just fine. But I don't know how to access the 'deeper level' keys. I know how to access a lower-level value, namely with dictObject['FirstLevelKey']['SecondLevelKey']. The problem, though, is that this returns the value of that key and not the key itself. I somehow need to tell the code where exactly to find the key inside the braces { }.
As an example: there is a 'first level' key inside my main JSON (tweets[i]._json) named 'user', which has a JSON object as its value containing the key 'geo_enabled'. How could I tell my program to retrieve this key the same way as my 'first level' keys 'created_at', 'id', 'full_text'?
I hope I was able to express my problem in an understandable manner. Thanks in advance.
selected = {k: tweets[i]._json[k] for k in {'created_at', 'id', 'full_text', tweets[i]._json['user']['geo_enabled']} obviously doesn't work
{"created_at": "Thu Dec 10 14:12:18 +0000 2020",
"id": 1337804994,
"id_str": "1337037427630804994",
"full_text": "hello",
"user": {
"id": 25360913,
"id_str": "25360913",
"translator_type": "none"
},
"geo": False,
"coordinates": False,
"retweeted": False,
"lang": "de"
}
Here is some info about accessing nested dictionary values: https://www.geeksforgeeks.org/python-nested-dictionary/
You can do it this way very simply:
tweet_dict = {
"created_at": "Thu Dec 10 14:12:18 +0000 2020",
"id": 1337804994,
"id_str": "1337037427630804994",
"full_text": "hello",
"user": {
"id": 25360913,
"id_str": "25360913",
"translator_type": "none"
},
"geo": False,
"coordinates": False,
"retweeted": False,
"lang": "de"
}
new_dict = {
**tweet_dict, # unpack the tweet dict
'user_id': tweet_dict['user']['id'], # add the user_id key
}
# pretty print the output
from pprint import pprint
pprint(new_dict)
Or without creating a new dict:
# Copy the nested user id up to the top level of the same dict.
# (No trailing comma after the value: a trailing comma would store the
# 1-tuple (25360913,) instead of the integer.)
tweet_dict['user_id'] = tweet_dict['user']['id']  # add the user_id key
Output:
{'coordinates': False,
'created_at': 'Thu Dec 10 14:12:18 +0000 2020',
'full_text': 'hello',
'geo': False,
'id': 1337804994,
'id_str': '1337037427630804994',
'lang': 'de',
'retweeted': False,
'user': {'id': 25360913, 'id_str': '25360913', 'translator_type': 'none'},
'user_id': 25360913}

How to access JSON object within JSON array python?

I am using a wrapper for Reddit's API to return information about comments. The way the information is returned is very confusing to me and I am having trouble getting the info I need.
So the API returns information in this format:
comment(all_awardings=[], associated_award=None, author='raidoctober', author_flair_background_color=None, author_flair_css_class=None, author_flair_richtext=[], author_flair_template_id=None, author_flair_text=None, author_flair_text_color=None, author_flair_type='text', author_fullname='t2_1ekqex92', author_patreon_flair=False, author_premium=False, awarders=[], body="Haha, yeah I thought about it. But it's probably not worth it cause of all the taxes, copart fees, cost of turning a Salvage title into a rebuilt and the insurance deductible.", collapsed_because_crowd_control=None, created_utc=1591296781, gildings={}, id='fsw0scp', is_submitter=True, link_id='t3_gwn3rw', locked=False, no_follow=True, parent_id='t1_fsvyhq1', permalink='/r/motorcycles/comments/gwn3rw/did_copart_steal_my_motorcycle/fsw0scp/', retrieved_on=1591301318, score=1, send_replies=True, stickied=False, subreddit='motorcycles', subreddit_id='t5_2qi6d', top_awarded_type=None, total_awards_received=0, treatment_tags=[], created=1591321981.0, d_={'all_awardings': [], 'associated_award': None, 'author': 'raidoctober', 'author_flair_background_color': None, 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_template_id': None, 'author_flair_text': None, 'author_flair_text_color': None, 'author_flair_type': 'text', 'author_fullname': 't2_1ekqex92', 'author_patreon_flair': False, 'author_premium': False, 'awarders': [], 'body': "Haha, yeah I thought about it. 
But it's probably not worth it cause of all the taxes, copart fees, cost of turning a Salvage title into a rebuilt and the insurance deductible.", 'collapsed_because_crowd_control': None, 'created_utc': 1591296781, 'gildings': {}, 'id': 'fsw0scp', 'is_submitter': True, 'link_id': 't3_gwn3rw', 'locked': False, 'no_follow': True, 'parent_id': 't1_fsvyhq1', 'permalink': '/r/motorcycles/comments/gwn3rw/did_copart_steal_my_motorcycle/fsw0scp/', 'retrieved_on': 1591301318, 'score': 1, 'send_replies': True, 'stickied': False, 'subreddit': 'motorcycles', 'subreddit_id': 't5_2qi6d', 'top_awarded_type': None, 'total_awards_received': 0, 'treatment_tags': [], 'created': 1591321981.0})
I tried to convert to JSON using
x = json.dumps(hit, sort_keys=True, indent=4)
# hit is the information returned (it is the comment before conversion)
which converts the comment into this JSON format:
[
[],
null,
"raidoctober",
null,
null,
[],
null,
null,
null,
"text",
"t2_1ekqex92",
false,
false,
[],
"Haha, yeah I thought about it. But it's probably not worth it cause of all the taxes, copart fees, cost of turning a Salvage title into a rebuilt and the insurance deductible.",
null,
1591296781,
{},
"fsw0scp",
true,
"t3_gwn3rw",
false,
true,
"t1_fsvyhq1",
"/r/motorcycles/comments/gwn3rw/did_copart_steal_my_motorcycle/fsw0scp/",
1591301318,
1,
true,
false,
"motorcycles",
"t5_2qi6d",
null,
0,
[],
1591321981.0,
{
"all_awardings": [],
"associated_award": null,
"author": "raidoctober",
"author_flair_background_color": null,
"author_flair_css_class": null,
"author_flair_richtext": [],
"author_flair_template_id": null,
"author_flair_text": null,
"author_flair_text_color": null,
"author_flair_type": "text",
"author_fullname": "t2_1ekqex92",
"author_patreon_flair": false,
"author_premium": false,
"awarders": [],
"body": "Haha, yeah I thought about it. But it's probably not worth it cause of all the taxes, copart fees, cost of turning a Salvage title into a rebuilt and the insurance deductible.",
"collapsed_because_crowd_control": null,
"created": 1591321981.0,
"created_utc": 1591296781,
"gildings": {},
"id": "fsw0scp",
"is_submitter": true,
"link_id": "t3_gwn3rw",
"locked": false,
"no_follow": true,
"parent_id": "t1_fsvyhq1",
"permalink": "/r/motorcycles/comments/gwn3rw/did_copart_steal_my_motorcycle/fsw0scp/",
"retrieved_on": 1591301318,
"score": 1,
"send_replies": true,
"stickied": false,
"subreddit": "motorcycles",
"subreddit_id": "t5_2qi6d",
"top_awarded_type": null,
"total_awards_received": 0,
"treatment_tags": []
}
]
I've tried indexing to access it but sometimes the size of the array is different so the results were inaccurate.
I need the "author", "body", and "permalink" tags.
I'm sorry if this is too vague! If you need more information/clarification please let me know.
Does this help?
hit = [
[],
None,
"raidoctober",
None,
None,
[],
None,
None,
None,
"text",
"t2_1ekqex92",
False,
False,
[],
"Haha, yeah I thought about it. But it's probably not worth it cause of all the taxes, copart fees, cost of turning a Salvage title into a rebuilt and the insurance deductible.",
None,
1591296781,
{},
"fsw0scp",
True,
"t3_gwn3rw",
False,
True,
"t1_fsvyhq1",
"/r/motorcycles/comments/gwn3rw/did_copart_steal_my_motorcycle/fsw0scp/",
1591301318,
1,
True,
False,
"motorcycles",
"t5_2qi6d",
None,
0,
[],
1591321981.0,
{
"all_awardings": [],
"associated_award": None,
"author": "raidoctober",
"author_flair_background_color": None,
"author_flair_css_class": None,
"author_flair_richtext": [],
"author_flair_template_id": None,
"author_flair_text": None,
"author_flair_text_color": None,
"author_flair_type": "text",
"author_fullname": "t2_1ekqex92",
"author_patreon_flair": False,
"author_premium": False,
"awarders": [],
"body": "Haha, yeah I thought about it. But it's probably not worth it cause of all the taxes, copart fees, cost of turning a Salvage title into a rebuilt and the insurance deductible.",
"collapsed_because_crowd_control": None,
"created": 1591321981.0,
"created_utc": 1591296781,
"gildings": {},
"id": "fsw0scp",
"is_submitter": True,
"link_id": "t3_gwn3rw",
"locked": False,
"no_follow": True,
"parent_id": "t1_fsvyhq1",
"permalink": "/r/motorcycles/comments/gwn3rw/did_copart_steal_my_motorcycle/fsw0scp/",
"retrieved_on": 1591301318,
"score": 1,
"send_replies": True,
"stickied": False,
"subreddit": "motorcycles",
"subreddit_id": "t5_2qi6d",
"top_awarded_type": None,
"total_awards_received": 0,
"treatment_tags": []
}
]
# The list mixes positional values with one dict that holds the named fields.
# Scan for that dict instead of relying on its index, since the list length
# varies between comments.
wanted_keys = ("author", "body", "permalink")
for item in hit:
    if isinstance(item, dict) and all(key in item for key in wanted_keys):
        # Look each field up on the dict itself; ['body'] alone would just be
        # a one-element list containing the string 'body'.
        reqd_dict = {key: item[key] for key in wanted_keys}
        print("Found it!!")

Sublime Text 3 adds spaces after saving file

I have a problem with Sublime Text 3 when I save a file with Python syntax. Spaces around the operators are added automatically.
For example:
I type: print(a," ",b+c) and after saving the file it becomes: print(a, " ", b + c)
I tried changing the settings, but I don't know what adds the spaces after CTRL+S. I do want white space, but I don't want spaces added automatically in places where I didn't type them.
My settings file:
{
"always_show_minimap_viewport": true,
"auto_complete": false,
"auto_complete_commit_on_tab": true,
"auto_match_enabled": true,
"bold_folder_labels": true,
"caret_style": "solid",
"color_scheme": "Packages/ayu/ayu-dark.tmTheme",
"detect_indentation": true,
"draw_indent_guides": true,
"draw_minimap_border": true,
"enable_telemetry": false,
"ensure_newline_at_eof_on_save": true,
"folder_exclude_patterns":
[
".svn",
".git",
".hg",
"CVS",
"*.DS_Store",
"*.pyc",
"pycache"
],
"font_face": "Liberation Mono",
"font_options":
[
"subpixel_antialias",
"no_round"
],
"font_size": 14,
"highlight_line": false,
"highlight_modified_tabs": false,
"ignored_packages":
[
"Vintage"
],
"indent_guide_options":
[
"draw_active",
"draw_normal"
],
"indent_to_bracket": true,
"line_padding_bottom": 0,
"line_padding_top": 0,
"match_brackets": true,
"match_brackets_angle": false,
"match_brackets_braces": true,
"match_brackets_content": true,
"match_brackets_square": true,
"new_window_settings":
{
"hide_open_files": true,
"show_tabs": true,
"side_bar_visible": true,
"status_bar_visible": true
},
"pep8_max_line_length": 79,
"preview_on_click": false,
"rulers":
[
79
],
"shift_tab_unindent": true,
"show_panel_on_build": false,
"soda_classic_tabs": true,
"soda_folder_icons": false,
"tab_size": 4,
"theme": "ayu-dark.sublime-theme",
"translate_tabs_to_spaces": true,
"trim_trailing_white_space_on_save": true,
"use_simple_full_screen": true,
"vintage_start_in_command_mode": false,
"wide_caret": true,
"word_wrap": true,
"wrap_width": 80,
"material_theme_accent_graphite": true ,
"material_theme_compact_sidebar": true,
"file_exclude_patterns": ["*.pyc", "*.pyo", "*.exe", "*.dll", "*.obj","*.o", "*.a", "*.lib", "*.so", "*.dylib", "*.ncb", "*.sdf", "*.suo", "*.pdb", "*.idb", ".DS_Store", "*.class", "*.psd", "*.sublime-workspace"],
}
Syntax-settings file:
// These settings override both User and Default settings for the Python syntax
{
"draw_white_space": "all",
"auto_indent": true,
"rulers": [79],
"smart_indent": true,
"tab_size": 4,
"trim_automatic_white_space": true,
"use_tab_stops": true,
"word_wrap": true,
"wrap_width": 80
}
I found a solution. It was the Anaconda plugin.
You must set the following flags in the user settings of this plugin:
"auto_formatting": true,
"autoformat_ignore":
[
"E309",
"E501",
"E221",
"E222",
"E223",
"E225",
"E226",
"E227",
"E228",
"E231"
]

Trouble parsing Twitter API JSON output for User profile data with Python

I thought this other SO thread would have answered my question (http://stackoverflow.com/questions/4883751/trouble-reading-json-object-in-python), as it is very similar to my problem, but the data there are a little different than the data in my case.
I have about 470 records pulled from the Twitter API for twitter user data, something like:
{
steve: {
follow_request_sent: false,
profile_use_background_image: true,
default_profile_image: false,
geo_enabled: true,
verified: false,
profile_image_url_https: "https://si0.twimg.com/profile_images/1416115378/profile_normal.jpg",
profile_sidebar_fill_color: "F8E846",
id: 1376271,
profile_text_color: "000000",
followers_count: 2042,
profile_sidebar_border_color: "FFFFFF",
location: "Dallas and 51°33′28″N 0°6′10″W",
profile_background_color: "7d0000",
listed_count: 110,
status: {
favorited: false,
contributors: null,
truncated: false,
text: "So Microsoft's cloud is down. Can't say I have noticed. To the cloud! (the Amazon one of course)",
created_at: "Wed Feb 29 15:51:44 +0000 2012",
retweeted: false,
in_reply_to_status_id: null,
coordinates: null,
id: 174884564718723070,
source: "TweetDeck",
in_reply_to_status_id_str: null,
in_reply_to_screen_name: null,
id_str: "174884564718723073",
place: null,
retweet_count: 0,
geo: null,
in_reply_to_user_id_str: null,
in_reply_to_user_id: null
},
utc_offset: -21600,
statuses_count: 11504,
description: "Network engineer. Cisco, Juniper, F5, HP, EMC, etc. If it is in the data center I deal with it. Arsenal and Mavericks supporter to the max over at #steverossen",
friends_count: 822,
profile_link_color: "0000ff",
profile_image_url: "http://a0.twimg.com/profile_images/1416115378/profile_normal.jpg",
is_translator: false,
show_all_inline_media: false,
profile_background_image_url_https: "https://si0.twimg.com/profile_background_images/192104695/stadium.jpg",
id_str: "1376271",
profile_background_image_url: "http://a2.twimg.com/profile_background_images/192104695/stadium.jpg",
screen_name: "steve",
lang: "en",
profile_background_tile: false,
favourites_count: 0,
name: "Steve Rossen",
notifications: false,
url: "http://steverossen.com",
created_at: "Sat Mar 17 21:36:32 +0000 2007",
contributors_enabled: false,
time_zone: "Central Time (US & Canada)",
protected: false,
default_profile: false,
following: false
},
}
the problem being that each record starts with the person's twitter handle so is different for each record. So I've only been able to get so far as using:
import json
import csv
f = open('my.json')
data = json.load(f)
f.close()
for item in data:
print item
to print out those handles but can't figure out how to get into each person's record without having a key.
What am I grossly overlooking here? I would at least like to get at the "description", which is nested inside the user's name as a key.
Maybe I'm missing what exactly you are looking for, but couldn't you do this:
import json

# Load the whole file; the context manager closes it even if parsing fails.
with open('my.json') as f:
    data = json.load(f)

# The top-level keys are the Twitter handles, which differ per record, so
# iterate over the values (one profile dict per user) rather than naming keys.
for profile in data.values():
    print(profile["description"])

Categories

Resources