Python Reddit API JSON issues (no PRAW)

I am trying to obtain the replies to comments on Reddit threads. Here is what I have been able to accomplish by parsing JSON:
import requests

# 'headers' is assumed to already hold an OAuth bearer token and a User-Agent
subreddit = 'wallstreetbets'
link = 'https://oauth.reddit.com/r/' + subreddit + '/hot'
hot = requests.get(link, headers=headers)
hot.json()
Here is the output:
{'kind': 'Listing',
'data': {'after': 't3_x8kidp',
'dist': 27,
'modhash': None,
'geo_filter': None,
'children': [{'kind': 't3',
'data': {'approved_at_utc': None,
'subreddit': 'wallstreetbets',
'selftext': '**Read [rules](https://www.reddit.com/r/wallstreetbets/wiki/contentguide), follow [Twitter](https://twitter.com/Official_WSB) and [IG](https://www.instagram.com/official_wallstreetbets/), join [Discord](https://discord.gg/wallstreetbets), see [ban bets](https://www.reddit.com/r/wallstreetbets/wiki/banbets)!**\n\n[dm mods because why not](https://www.reddit.com/message/compose/?to=/r/wallstreetbets)\n\n[Earnings Thread](https://wallstreetbets.reddit.com/x4ryjg)',
'author_fullname': 't2_bd6q5',
'saved': False,
'mod_reason_title': None,
'gilded': 0,
'clicked': False,
'title': 'What Are Your Moves Tomorrow, September 08, 2022',
'link_flair_richtext': [{'e': 'text', 't': 'Daily Discussion'}],
'subreddit_name_prefixed': 'r/wallstreetbets',
'hidden': False,
'pwls': 7,
'link_flair_css_class': 'daily',
'downs': 0,
'thumbnail_height': None,
'top_awarded_type': None,
'hide_score': False,
'name': 't3_x8ev67',
...
'created_utc': 1662594703.0,
'num_crossposts': 0,
'media': None,
'is_video': False}}],
'before': None}}
I then turned it into a data frame
df = pd.DataFrame()
for post in hot.json()['data']['children']:
    df = df.append({
        'subreddit': post['data']['subreddit'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'created_utc': post['data']['created_utc'],
        'id': post['data']['id']
    }, ignore_index=True)
With this, I was able to obtain a data frame like this: [screenshot of the resulting DataFrame]
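(An aside: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; the same frame can be built in one shot from a list of dicts, a sketch:)

rows = [{'subreddit': post['data']['subreddit'],
         'title': post['data']['title'],
         'selftext': post['data']['selftext'],
         'created_utc': post['data']['created_utc'],
         'id': post['data']['id']}
        for post in hot.json()['data']['children']]
df = pd.DataFrame(rows)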
Then, to obtain the comments, I created a list holding the comments JSON for each of the 26 posts, and wrote a while loop to iterate through it.
supereme = len(list_of_comments)
indexy = pd.DataFrame()
while supereme > 0:
    supereme -= 1
    for g in range(0, len(list_of_comments[supereme]['data']['children']) - 1):
        indexy = pd.concat([indexy, pd.DataFrame.from_records([{
            'body': list_of_comments[supereme]['data']['children'][g]['data']['body'],
            'post_id': list_of_comments[supereme]['data']['children'][g]['data']['parent_id']
        }])], ignore_index=True)
indexy
This gave me this: [screenshot of the resulting DataFrame]
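(In passing: range(0, len(...) - 1) stops one child short of the end, and the backwards while is not needed; a sketch of the same collection done forward, using .get since kind-'more' stubs carry no body:)

rows = []
for listing in list_of_comments:
    for child in listing['data']['children']:
        rows.append({'body': child['data'].get('body'),
                     'post_id': child['data'].get('parent_id')})
indexy = pd.DataFrame(rows)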
However, I am not able to obtain the replies to the comments. Any help? I tried this:
posts = 26
for i in np.arange(0, 27):
    print('i', i)
    if len(list_of_comments[i]['data']['children']) == 0:
        continue
    for j in np.arange(0, len(list_of_comments[i]['data']['children'])):
        if len(list_of_comments[i]['data']['children'][j]['data']['replies']) == 0:
            break
        else:
            print('j', len(list_of_comments[i]['data']['children'][j]['data']['replies']))
            for z in np.arange(len(list_of_comments[i]['data']['children'][j]['data']['replies']['data']['children'])):
                if len(list_of_comments[i]['data']['children'][j]['data']['replies']['data']['children']) == 0:
                    break
                print('z', z)
                print(list_of_comments[i]['data']['children'][j]['data']['replies']['data']['children'][z]['data']['body'])
The first loop kind of works, but it doesn't count up properly to get all the replies to all the posts; it'll only pull one or two. We don't want to use PRAW.

x = len(list_of_comments)
replies = pd.DataFrame()
for i in range(0, len(list_of_comments)):
    try:
        for j in range(0, len(list_of_comments[x]['data']['children'])):
            try:
                for z in range(0, len(list_of_comments[x]['data']['children'][j]['data']['replies']['data']['children'])):
                    #print(list_of_comments[x]['data']['children'][j]['data']['replies']['data']['children'][z]['data']['body'])
                    #print(list_of_comments[x]['data']['children'][j]['data']['replies']['data']['children'][z]['data']['link_id'])
                    replies = pd.concat([replies, pd.DataFrame.from_records([{
                        'body': list_of_comments[x]['data']['children'][j]['data']['replies']['data']['children'][z]['data']['body'],
                        'post_id': list_of_comments[x]['data']['children'][j]['data']['replies']['data']['children'][z]['data']['link_id']
                    }])], ignore_index=True)
            except:
                pass
    except:
        continue
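Note that this last attempt indexes list_of_comments with the fixed x = len(list_of_comments) instead of the loop variable i, so every lookup raises IndexError and the bare except swallows it. Since replies nest arbitrarily deep, a recursive walk is a more direct fit; a sketch, assuming each element of list_of_comments is the comments Listing for one post (in the raw JSON, replies is an empty string when a comment has none, and kind-'more' stubs carry no body):

def collect_replies(children, rows):
    # walk a comment Listing depth-first, collecting comment bodies
    for child in children:
        data = child.get('data', {})
        if 'body' not in data:        # skip 'more' stubs
            continue
        rows.append({'body': data['body'],
                     'post_id': data.get('link_id'),
                     'parent_id': data.get('parent_id')})
        replies = data.get('replies')
        if replies:                   # empty string when there are no replies
            collect_replies(replies['data']['children'], rows)

rows = []
for listing in list_of_comments:
    collect_replies(listing['data']['children'], rows)
replies_df = pd.DataFrame(rows)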

Related

Iterate through a nested dict inside of a list, with some missing keys

I need your guys' help on how to extract information from a nested dictionary inside a list. Here's the code to get the data:
import requests
import json
import time

all_urls = []
for x in range(5000, 5010):
    url = f'https://api.jikan.moe/v4/anime/{x}/full'
    all_urls.append(url)

all_responses = []
for page_url in all_urls:
    response = requests.get(page_url)
    all_responses.append(response)
    time.sleep(1)
print(all_responses)

data = []
for i in all_responses:
    json_data = json.loads(i.text)
    data.append(json_data)
print(data)
The sample of the extracted data is as follows:
[{'status': 404,
'type': 'BadResponseException',
'message': 'Resource does not exist',
'error': '404 on https://myanimelist.net/anime/5000/'},
{'status': 404,
'type': 'BadResponseException',
'message': 'Resource does not exist',
'error': '404 on https://myanimelist.net/anime/5001/'},
{'data': {'mal_id': 5002,
'url': 'https://myanimelist.net/anime/5002/Bari_Bari_Densetsu',
'images': {'jpg': {'image_url': 'https://cdn.myanimelist.net/images/anime/4/58873.jpg',
'small_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873t.jpg',
'large_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873l.jpg'},
'webp': {'image_url': 'https://cdn.myanimelist.net/images/anime/4/58873.webp',
'small_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873t.webp',
'large_image_url': 'https://cdn.myanimelist.net/images/anime/4/58873l.webp'}},
'trailer': {'youtube_id': None,
'url': None,
'embed_url': None,
'images': {'image_url': None,
'small_image_url': None,
'medium_image_url': None,
'large_image_url': None,
'maximum_image_url': None}},
'title': 'Bari Bari Densetsu',
'title_english': None,
'title_japanese': 'バリバリ伝説',
'title_synonyms': ['Baribari Densetsu',
......
I need to extract the title from the list of data. Any help is appreciated! Also, any recommendation on a better/simpler/cleaner code to extract the json data from an API is also greatly appreciated!
Firstly, no need to create multiple lists. You can do everything in one loop:
import requests
import json
import time

data = []
for x in range(5000, 5010):
    page_url = f'https://api.jikan.moe/v4/anime/{x}/full'
    response = requests.get(page_url)
    json_data = json.loads(response.text)
    data.append(json_data)
    time.sleep(1)  # stay under the API's rate limit, as in your original code
print(data)
Second, to address your problem: note that the title sits under the 'data' key, and the 404 entries have no 'data' at all. You have two options. You can use dict.get:
for dic in data:
    title = dic.get('data', {}).get('title', 'no title')
Or use the try/except pattern:
for dic in data:
    try:
        title = dic['data']['title']
    except KeyError:
        # deal with the case where this response has no title (e.g. a 404)
        pass
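Putting that together, a sketch that collects every title and skips the 404 entries from the sample above:

titles = []
for dic in data:
    title = dic.get('data', {}).get('title')
    if title is not None:
        titles.append(title)
print(titles)  # e.g. ['Bari Bari Densetsu', ...] for the 5000-5009 range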

How to extract a couple of fields nested in response using python

I'm a python beginner. I would like to ask for help with retrieving the response data. Here's my script:
import pandas as pd
import re
import time
import requests  # don't alias this as 're'; that would shadow the regex module
import json

response = requests.get(url, headers=headers, auth=auth)
data = response.json()
Here's a part of the JSON response:
{'result': [{'display': '',
'closure_code': '',
'service_offer': 'Integration Platforms',
'updated_on': '2022-04-23 09:05:53',
'urgency': '2',
'business_service': 'Operations',
'updated_by': 'serviceaccount45',
'description': 'ALERT returned 400 but expected 200',
'sys_created_on': '2022-04-23 09:05:53',
'sys_created_by': 'serviceaccount45',
'subcategory': 'Integration',
'contact_type': 'Email',
'problem_type': 'Design: Availability',
'caller_id': '',
'action': 'create',
'company': 'aaaa',
'priority': '3',
'status': '1',
'opened': 'smith.j',
'assigned_to': 'doe.j',
'number': '123456',
'group': 'blabla',
'impact': '2',
'category': 'Business Application & Databases',
'caused_by_change': '',
'location': 'All Locations',
'configuration_item': 'Monitor',
},
I would like to extract the data only for one group = 'blablabla'. Then I would like to extract fields such as:
number = data['number']
group = data['group']
service_offer = data['service_offer']
updated = data['updated_on']
urgency = data['urgency']
username = data['created_by']
short_desc = data['description']
How should it be done?
I know that to check the first value I should use:
service_offer = data['result'][0]['service_offer']
I've tried to create a dictionary, but I'm getting an error:
data_result = response.json()['result']
payload = {
    'number': data_result['number'],
    'group': data_result['group'],
    'service_offer': data_result['service_offer'],
    'updated': data_result['updated_on'],
    'urgency': data_result['urgency'],
    'username': data_result['created_by'],
    'short_desc': data_result['description']
}
TypeError: list indices must be integers or slices, not str
So I've started to create something like the below, but I'm stuck:
get_data = []
if len(data) > 0:
    for item in range(len(data)):
        get_data.append(data[item])
May I ask for help?
If data is your decoded json response from the question then you can do:
# find group `blabla` in result:
g = next(d for d in data["result"] if d["group"] == "blabla")
# get data from the `blabla` group:
number = g["number"]
group = g["group"]
service_offer = g["service_offer"]
updated = g["updated_on"]
urgency = g["urgency"]
username = g["sys_created_by"]
short_desc = g["description"]
print(number, group, service_offer, updated, urgency, username, short_desc)
Prints:
123456 blabla Integration Platforms 2022-04-23 09:05:53 2 serviceaccount45 ALERT returned 400 but expected 200
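One caveat: next raises StopIteration when no record matches, so if the group may be absent you can pass a default:

# returns None instead of raising when no record has the group
g = next((d for d in data["result"] if d["group"] == "blabla"), None)
if g is None:
    print("no record for group 'blabla'")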

Python, sorting extracted youtube data

I'm trying to retrieve the top 10 most watched videos in a youtube channel. I'm not quite sure how to do that; the result returns 10 videos from a certain timeframe, I believe. Also, the y-axis ('views') of the plotted bar graph is not in order. In summary, I need help plotting a graph of the number of views for each of the 10 videos.
Obtaining statistics
youtube = build('youtube', 'v3', developerKey=api_key)
request = youtube.channels().list(
    part='statistics',
    id='UC-lHJZR3Gqxt24_Td_AJ5Yw'
)
#To get a response, use execute()
response = request.execute()
#List indices must be integers or slices, not str
stats = response['items'][0]['statistics']
video_count = stats['videoCount']

contentdata = youtube.channels().list(
    id='UC-lHJZR3Gqxm24_Vd_AJ5Yw',
    part='contentDetails'
).execute()
playlist_id = contentdata['items'][0]['contentDetails']['relatedPlaylists']['uploads']

videos = []
next_page_token = None
while 1:
    res = youtube.playlistItems().list(
        playlistId=playlist_id,
        part='snippet',
        maxResults=50,
        pageToken=next_page_token
    ).execute()
    videos += res['items']
    next_page_token = res.get('nextPageToken')
    if next_page_token is None:
        break

#Get video ID for each video
video_ids = list(map(lambda x: x['snippet']['resourceId']['videoId'], videos))

#Get statistics for each video
stats = []
for i in range(0, len(video_ids), 40):
    res = youtube.videos().list(
        id=','.join(video_ids[i:i+40]),
        part='statistics'
    ).execute()
    stats += res['items']

views, links = [], []
for i in range(len(videos[:10])):
    try:
        title = videos[i]['snippet']['title']
        view = stats[i]['statistics']['viewCount']
        link = f"<a href='{stats[i]['id']}'>{title}</a>"
    except KeyError:
        continue
    else:
        views.append(view)
        links.append(link)
Plotting
from youtube_bar import links, views
from plotly.graph_objs import Bar
from plotly import offline

#Create bar graph with data
data = [{
    'type': 'bar',
    'x': links,
    'y': views,
    'opacity': 0.6,
    'marker': {
        'color': 'rgb(150, 100, 20)',
        'line': {'width': 1.5, 'color': 'rgb(25, 25, 25)'}
    },
}]
my_layout = {
    'title': 'Top 10 most views for channel',
    'titlefont': {'size': 28},
    'xaxis': {
        'title': 'Videos',
        'titlefont': {'size': 24},
        'tickfont': {'size': 14},
    },
    'yaxis': {
        'title': 'Views',
        'titlefont': {'size': 24},
        'tickfont': {'size': 14},
    },
}
fig = {'data': data, 'layout': my_layout}
offline.plot(fig, filename='youtube_videos.html')
[screenshot of the resulting bar graph]
The loop in your code:
for i in range(len(videos[:10])):
is taking only the first 10 videos (the most recent ones, which is why it looks like a 'timeframe'); you have to sort the videos by their viewCount before this loop.
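For example, a sketch of that sort, assuming stats lines up one-to-one with videos as built above (viewCount comes back as a string, so cast it to int):

# pair each video with its statistics and sort by view count, descending
paired = sorted(zip(videos, stats),
                key=lambda vs: int(vs[1]['statistics'].get('viewCount', 0)),
                reverse=True)

views, links = [], []
for video, stat in paired[:10]:
    title = video['snippet']['title']
    views.append(int(stat['statistics'].get('viewCount', 0)))
    links.append(f"<a href='{stat['id']}'>{title}</a>")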

Can't fetch an item out of weird json content

I'm trying to get some items from JSON content. However, the structure of that JSON content is foreign to me, and as a result I can't fetch the value of property out of it.
I've tried so far with:
import json
import requests
from bs4 import BeautifulSoup

link = 'https://www.zillow.com/homedetails/5958-SW-4th-St-Miami-FL-33144/43835884_zpid/'

def fetch_content(link):
    content = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(content.text, "lxml")
    item = soup.select_one("script#hdpApolloPreloadedData").text
    print(json.loads(item)['apiCache'])

if __name__ == '__main__':
    fetch_content(link)
The result I get running the above script is:
{"VariantQuery{\"zpid\":43835884}":{"property":{"zpid":43835884,"streetAddress":"5958 SW 4th St",
Which I can't further process for that weird key in front.
Expected output:
{"zpid":43835884,"streetAddress":"5958 SW 4th St", ----
How can I get the value of that property?
You can get zpid and streetAddress out of that mangled JSON with a second json.loads (item here is already the script tag's text, so there is no further .text to take):
json.loads(json.loads(item)['apiCache'])['VariantQuery{"zpid":43835884}']['property']['zpid']
Out[1889]: 43835884
json.loads(json.loads(item)['apiCache'])['VariantQuery{"zpid":43835884}']['property']['streetAddress']
Out[1890]: '5958 SW 4th St'
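If you'd rather not hardcode the zpid inside that key, one way (assuming a single VariantQuery entry in apiCache) is to find the key dynamically:

cache = json.loads(json.loads(item)['apiCache'])
# pick out the 'VariantQuery{"zpid":...}' key without spelling out the zpid
variant_key = next(k for k in cache if k.startswith('VariantQuery'))
prop = cache[variant_key]['property']
print(prop['zpid'], prop['streetAddress'])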
I noticed you can always get the zpid like this:
link = 'https://www.zillow.com/homedetails/5958-SW-4th-St-Miami-FL-33144/43835884_zpid/'
content = requests.get(link,headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(content.text,"lxml")
item = soup.select_one("script#hdpApolloPreloadedData").text
print(json.loads(item)['zpid'])
Just modify your function to the following. I also added another function (process_fetched_content()) to give you some more freedom. You could simply run it and it will take care of situations even when you have multiple keys that start with 'VariantQuery{"zpid":'. The final output is a dict with the keys being your zpid and the values being what you are looking for.
If you have a lot of zpid values, then this will let you accumulate them all together and then process them. The benefit is the list of keys is then the list of zpids you have.
Here's how you could use this code.
results = process_fetched_content(raw_dictionary = fetch_content(link, verbose=False))
print(results)
output:
{'43835884': {'zpid': 43835884, 'streetAddress': '5958 SW 4th St', 'zipcode': '33144', 'city': 'Miami', 'state': 'FL', 'latitude': 25.76661, 'longitude': -80.292801, 'price': 340000, 'dateSold': 1576875600000, 'bathrooms': 2, 'bedrooms': 3, 'livingArea': 1757, 'yearBuilt': 1973, 'lotSize': 4331, 'homeType': 'SINGLE_FAMILY', 'homeStatus': 'RECENTLY_SOLD', 'photoCount': 19, 'imageLink': 'https://photos.zillowstatic.com/p_g/IS7yxihwtuqmlq1000000000.jpg', 'daysOnZillow': 0, 'isFeatured': False, 'shouldHighlight': False, 'brokerId': 0, 'zestimate': 341336, 'rentZestimate': 2200, 'listing_sub_type': {}, 'priceReduction': '', 'isUnmappable': False, 'rentalPetsFlags': 128, 'mediumImageLink': 'https://photos.zillowstatic.com/p_c/IS7yxihwtuqmlq1000000000.jpg', 'isPreforeclosureAuction': False, 'homeStatusForHDP': 'RECENTLY_SOLD', 'priceForHDP': 340000, 'festimate': 341336, 'isListingOwnedByCurrentSignedInAgent': False, 'isListingClaimedByCurrentSignedInUser': False, 'hiResImageLink': 'https://photos.zillowstatic.com/p_f/IS7yxihwtuqmlq1000000000.jpg', 'watchImageLink': 'https://photos.zillowstatic.com/p_j/IS7yxihwtuqmlq1000000000.jpg', 'tvImageLink': 'https://photos.zillowstatic.com/p_m/IS7yxihwtuqmlq1000000000.jpg', 'tvCollectionImageLink': 'https://photos.zillowstatic.com/p_l/IS7yxihwtuqmlq1000000000.jpg', 'tvHighResImageLink': 'https://photos.zillowstatic.com/p_n/IS7yxihwtuqmlq1000000000.jpg', 'zillowHasRightsToImages': True, 'desktopWebHdpImageLink': 'https://photos.zillowstatic.com/p_h/IS7yxihwtuqmlq1000000000.jpg', 'isNonOwnerOccupied': False, 'hideZestimate': False, 'isPremierBuilder': False, 'isZillowOwned': False, 'currency': 'USD', 'country': 'USA', 'taxAssessedValue': 224131, 'streetAddressOnly': '5958 SW 4th St', 'unit': ' '}}
Code
import json
import requests
from bs4 import BeautifulSoup

link = 'https://www.zillow.com/homedetails/5958-SW-4th-St-Miami-FL-33144/43835884_zpid/'

def fetch_content(link, verbose=False):
    content = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(content.text, "lxml")
    item = soup.select_one("script#hdpApolloPreloadedData").text
    d = json.loads(item)['apiCache']
    d = json.loads(d)
    if verbose:
        print(d)
    return d

def process_fetched_content(raw_dictionary=None):
    if raw_dictionary is not None:
        keys = [k for k in raw_dictionary.keys() if k.startswith('VariantQuery{"zpid":')]
        # index the dict passed in (not a variable from another scope)
        results = dict((k.split(':')[-1].replace('}', ''), raw_dictionary.get(k).get('property', None)) for k in keys)
        return results
    else:
        return None

recursively collect string blocks in python

I have a custom data file formatted like this:
{
    data = {
        friends = {
            max = 0 0,
            min = 0 0,
        },
        family = {
            cars = {
                van = "honda",
                car = "ford",
                bike = "trek",
            },
            presets = {
                location = "italy",
                size = 10,
                travelers = False,
            },
            version = 1,
        },
    },
}
I want to collect the blocks of data, meaning the strings between each set of {}, while maintaining a hierarchy. This data is not in a typical JSON format, so that is not a possible solution.
My idea was to create a class object like so
class Block:
    def __init__(self, header, children):
        self.header = header
        self.children = children
where I would then loop through the data line by line, 'somehow' collecting the necessary data, so my resulting output would look something like this...
Block("data = {}", [
    Block("friends = {max = 0 0,\n min = 0 0,}", []),
    Block("family = {version = 1}", [...])
])
In short, I'm looking for help on ways to serialize this into useful data that I can then easily manipulate; my approach is to break it into objects using the {} as dividers. If anyone has suggestions on better ways to approach this, I'm open to ideas. Thank you.
So far I've only implemented these basic snippets of code:
class Block:
    def __init__(self, content, children):
        self.content = content
        self.children = children

def GetBlock(strArr=[]):
    print(len(strArr))
    # blocks = []
    blockStart = "{"
    blockEnd = "}"

with open(filepath, 'r') as file:
    data = file.readlines()
    blocks = GetBlock(strArr=data)
You can create a to_block function that takes the lines from your file as an iterator and recursively creates a nested dictionary from those. (Of course you could also use a custom Block class, but I don't really see the benefit in doing so.)
def to_block(lines):
    block = {}
    for line in lines:
        if line.strip().endswith(("}", "},")):
            break
        key, value = map(str.strip, line.split(" = "))
        if value.endswith("{"):
            value = to_block(lines)
        block[key] = value
    return block
When calling it, you have to strip the first line, though. Also, evaluating the "leafs" to e.g. numbers or strings is left as an exercise to the reader (a sketch follows below).
>>> to_block(iter(data.splitlines()[1:]))
{'data': {'family': {'version': '1,',
'cars': {'bike': '"trek",', 'car': '"ford",', 'van': '"honda",'},
'presets': {'travelers': 'False,', 'size': '10,', 'location': '"italy",'}},
'friends': {'max': '0 0,', 'min': '0 0,'}}}
Or when reading from a file:
with open("data.txt") as f:
    next(f)  # skip first line
    res = to_block(f)
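As for that exercise, a minimal converter sketch, assuming the leaves are only integers, quoted strings, booleans, or space-separated integer pairs (parse_value and convert are made-up names for illustration):

def parse_value(raw):
    raw = raw.rstrip(",").strip()          # to_block keeps the trailing comma
    if raw.startswith('"') and raw.endswith('"'):
        return raw[1:-1]                   # quoted string
    if raw in ("True", "False"):
        return raw == "True"               # boolean
    parts = raw.split()
    if len(parts) > 1:
        return [int(p) for p in parts]     # e.g. "0 0" -> [0, 0]
    try:
        return int(raw)
    except ValueError:
        return raw                         # leave anything unrecognized as-is

def convert(block):
    return {k: convert(v) if isinstance(v, dict) else parse_value(v)
            for k, v in block.items()}

Then convert(to_block(f)) gives the fully typed nested dict.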
Alternatively, you can do some preprocessing to transform that string into a JSON(-ish) string and then use json.loads. However, I would not go all the way here, but instead just wrap the values in "" (and replace the original " with ' before that); otherwise there is too much risk of accidentally turning a string with spaces into a list or similar. You can sort those out once you've created the JSON data.
>>> data = data.replace('"', "'")
>>> data = re.sub(r'= (.+),$', r'= "\1",', data, flags=re.M)
>>> data = re.sub(r'^\s*(\w+) = ', r'"\1": ', data, flags=re.M)
>>> data = re.sub(r',$\s*}', r'}', data, flags=re.M)
>>> json.loads(data)
{'data': {'family': {'version': '1',
'presets': {'size': '10', 'travelers': 'False', 'location': "'italy'"},
'cars': {'bike': "'trek'", 'van': "'honda'", 'car': "'ford'"}},
'friends': {'max': '0 0', 'min': '0 0'}}}
You can also do this with ast or json, with the help of regex substitutions.
import re
a = """{
    data = {
        friends = {
            max = 0 0,
            min = 0 0,
        },
        family = {
            cars = {
                van = "honda",
                car = "ford",
                bike = "trek",
            },
            presets = {
                location = "italy",
                size = 10,
                travelers = False,
            },
            version = 1,
        },
    },
}"""
#with ast
a = re.sub(r"(\w+)\s*=\s*", r'"\1":', a)
a = re.sub(r":\s*((?:\d+)(?: \d+)+)", lambda x: ':[' + x.group(1).replace(" ", ",") + "]", a)
import ast
print(ast.literal_eval(a))
#{'data': {'friends': {'max': [0, 0], 'min': [0, 0]}, 'family': {'cars': {'car': 'ford', 'bike': 'trek', 'van': 'honda'}, 'presets': {'travelers': False, 'location': 'italy', 'size': 10}, 'version': 1}}}

#with json
import json
a = re.sub(r",(\s*\})", r"\1", a)
a = a.replace(":True", ":true").replace(":False", ":false").replace(":None", ":null")
print(json.loads(a))
#{'data': {'friends': {'max': [0, 0], 'min': [0, 0]}, 'family': {'cars': {'car': 'ford', 'bike': 'trek', 'van': 'honda'}, 'presets': {'travelers': False, 'location': 'italy', 'size': 10}, 'version': 1}}}
