I'm trying to retrieve the top 10 most watched videos of a YouTube channel. I'm not quite sure how to do that; the results return 10 videos from a certain timeframe, I believe. Also, the y-axis ('views') of the plotted bar graph is not in order. In summary, I need help plotting a graph of the number of views for each of the 10 videos.
Obtaining statistics
from googleapiclient.discovery import build

youtube = build('youtube', 'v3', developerKey=api_key)
request = youtube.channels().list(
    part='statistics',
    id='UC-lHJZR3Gqxm24_Vd_AJ5Yw'
)
# To get a response, use execute()
response = request.execute()
# response['items'] is a list, so index it with an integer before the string keys
stats = response['items'][0]['statistics']
video_count = stats['videoCount']
contentdata = youtube.channels().list(
    id='UC-lHJZR3Gqxm24_Vd_AJ5Yw',
    part='contentDetails'
).execute()
playlist_id = contentdata['items'][0]['contentDetails']['relatedPlaylists']['uploads']
videos = []
next_page_token = None
while True:
    res = youtube.playlistItems().list(
        playlistId=playlist_id,
        part='snippet',
        maxResults=50,
        pageToken=next_page_token
    ).execute()
    videos += res['items']
    next_page_token = res.get('nextPageToken')
    if next_page_token is None:
        break
# Get the video ID for each video
video_ids = list(map(lambda x: x['snippet']['resourceId']['videoId'], videos))

# Get statistics for each video, batching IDs to stay under the per-request limit
stats = []
for i in range(0, len(video_ids), 40):
    res = youtube.videos().list(
        id=','.join(video_ids[i:i+40]),
        part='statistics'
    ).execute()
    stats += res['items']
views, links = [], []
for i in range(len(videos[:10])):
    try:
        title = videos[i]['snippet']['title']
        view = stats[i]['statistics']['viewCount']
        # Build a full video URL so the link actually resolves
        link = f"<a href='https://youtu.be/{stats[i]['id']}'>{title}</a>"
    except KeyError:
        continue
    else:
        views.append(view)
        links.append(link)
Plotting
from youtube_bar import links, views
from plotly.graph_objs import Bar
from plotly import offline

# Create bar graph with data
data = [{
    'type': 'bar',
    'x': links,
    'y': views,
    'opacity': 0.6,
    'marker': {
        'color': 'rgb(150, 100, 20)',
        'line': {'width': 1.5, 'color': 'rgb(25, 25, 25)'}
    },
}]
my_layout = {
    'title': 'Top 10 most views for channel',
    'titlefont': {'size': 28},
    'xaxis': {
        'title': 'Videos',
        'titlefont': {'size': 24},
        'tickfont': {'size': 14},
    },
    'yaxis': {
        'title': 'Views',
        'titlefont': {'size': 24},
        'tickfont': {'size': 14},
    },
}
fig = {'data': data, 'layout': my_layout}
offline.plot(fig, filename='youtube_videos.html')
[bar graph screenshot]
The loop in your code:

for i in range(len(videos[:10])):

is taking only the first 10 entries of the uploads playlist (the most recent uploads, which is why it looks like a 'timeframe'); you have to sort the videos by their viewCount before this. Note also that viewCount is returned as a string, which is why the y-axis values don't come out in numeric order.
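A minimal sketch of that sort, assuming videos and stats are the parallel lists built above (same order, one statistics item per video):

# Pair each video with its statistics, sort by numeric view count,
# and keep the ten most-viewed.
ranked = sorted(
    zip(videos, stats),
    key=lambda pair: int(pair[1]['statistics']['viewCount']),
    reverse=True,
)[:10]
videos = [v for v, s in ranked]
stats = [s for v, s in ranked]

With the lists reordered this way, the existing videos[:10] loop emits the top 10; appending int(view) rather than the raw string keeps the y-axis numeric.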
I am trying to obtain the replies to the comments on Reddit threads. Here is what I have been able to accomplish by parsing the JSON:
subreddit = 'wallstreetbets'
link = 'https://oauth.reddit.com/r/' + subreddit + '/hot'
hot = requests.get(link, headers=headers)
hot.json()
Here is the output:
{'kind': 'Listing',
'data': {'after': 't3_x8kidp',
'dist': 27,
'modhash': None,
'geo_filter': None,
'children': [{'kind': 't3',
'data': {'approved_at_utc': None,
'subreddit': 'wallstreetbets',
'selftext': '**Read [rules](https://www.reddit.com/r/wallstreetbets/wiki/contentguide), follow [Twitter](https://twitter.com/Official_WSB) and [IG](https://www.instagram.com/official_wallstreetbets/), join [Discord](https://discord.gg/wallstreetbets), see [ban bets](https://www.reddit.com/r/wallstreetbets/wiki/banbets)!**\n\n[dm mods because why not](https://www.reddit.com/message/compose/?to=/r/wallstreetbets)\n\n[Earnings Thread](https://wallstreetbets.reddit.com/x4ryjg)',
'author_fullname': 't2_bd6q5',
'saved': False,
'mod_reason_title': None,
'gilded': 0,
'clicked': False,
'title': 'What Are Your Moves Tomorrow, September 08, 2022',
'link_flair_richtext': [{'e': 'text', 't': 'Daily Discussion'}],
'subreddit_name_prefixed': 'r/wallstreetbets',
'hidden': False,
'pwls': 7,
'link_flair_css_class': 'daily',
'downs': 0,
'thumbnail_height': None,
'top_awarded_type': None,
'hide_score': False,
'name': 't3_x8ev67',
...
'created_utc': 1662594703.0,
'num_crossposts': 0,
'media': None,
'is_video': False}}],
'before': None}}
I then turned it into a data frame
df = pd.DataFrame()
for post in hot.json()['data']['children']:
    df = df.append({
        'subreddit': post['data']['subreddit'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'created_utc': post['data']['created_utc'],
        'id': post['data']['id']
    }, ignore_index=True)
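(As an aside, DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; an equivalent sketch that collects the rows first:)

rows = [{
    'subreddit': post['data']['subreddit'],
    'title': post['data']['title'],
    'selftext': post['data']['selftext'],
    'created_utc': post['data']['created_utc'],
    'id': post['data']['id']
} for post in hot.json()['data']['children']]
df = pd.DataFrame(rows)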
With this, I was able to obtain a data frame like this: [DataFrame screenshot]
Then, to obtain the comments, I created a list with the JSON responses for all 26 posts, and wrote a while loop to iterate through them.
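(The construction of list_of_comments isn't shown above; a plausible sketch, assuming the same OAuth headers and the df of post IDs, is below. Reddit's comments endpoint returns a two-element array of [post listing, comment listing], hence the [1].)

list_of_comments = []
for post_id in df['id']:
    res = requests.get(f'https://oauth.reddit.com/r/{subreddit}/comments/{post_id}',
                       headers=headers)
    list_of_comments.append(res.json()[1])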
supereme = len(list_of_comments)
indexy = pd.DataFrame()
while supereme > 0:
    supereme -= 1
    for g in range(0, len(list_of_comments[supereme]['data']['children']) - 1):
        indexy = pd.concat([indexy, pd.DataFrame.from_records([{
            'body': list_of_comments[supereme]['data']['children'][g]['data']['body'],
            'post_id': list_of_comments[supereme]['data']['children'][g]['data']['parent_id']
        }])], ignore_index=True)
indexy
This gave me this: [DataFrame screenshot]
However, I am not able to obtain the replies to the comments. Any help? I tried this:
posts = 26
for i in np.arange(0, 27):
    print('i', i)
    if len(list_of_comments[i]['data']['children']) == 0:
        continue
    for j in np.arange(0, len(list_of_comments[i]['data']['children'])):
        if len(list_of_comments[i]['data']['children'][j]['data']['replies']) == 0:
            break
        else:
            print('j', len(list_of_comments[i]['data']['children'][j]['data']['replies']))
            for z in np.arange(len(list_of_comments[i]['data']['children'][j]['data']['replies']['data']['children'])):
                if len(list_of_comments[i]['data']['children'][j]['data']['replies']['data']['children']) == 0:
                    break
                print('z', z)
                print(list_of_comments[i]['data']['children'][j]['data']['replies']['data']['children'][z]['data']['body'])
The first loop kind of works, but it doesn't count up properly to get all the replies to all the posts; it only pulls one or two. We don't want to use PRAW.
replies = pd.DataFrame()
for i in range(len(list_of_comments)):
    try:
        for j in range(len(list_of_comments[i]['data']['children'])):
            try:
                for z in range(len(list_of_comments[i]['data']['children'][j]['data']['replies']['data']['children'])):
                    replies = pd.concat([replies, pd.DataFrame.from_records([{
                        'body': list_of_comments[i]['data']['children'][j]['data']['replies']['data']['children'][z]['data']['body'],
                        'post_id': list_of_comments[i]['data']['children'][j]['data']['replies']['data']['children'][z]['data']['link_id']
                    }])], ignore_index=True)
            except (KeyError, TypeError):
                # 'replies' is an empty string for comments without replies
                pass
    except (KeyError, TypeError):
        continue
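Since replies can nest arbitrarily deep, a recursive sketch may be more robust (assuming the same listing structure; on Reddit, 'replies' is an empty string when a comment has none):

def collect_replies(children, out):
    # Walk a comment listing depth-first, collecting every body found.
    for child in children:
        data = child.get('data', {})
        if 'body' in data:  # 'more' stubs carry no body
            out.append({'body': data['body'], 'post_id': data.get('link_id')})
        nested = data.get('replies')
        if isinstance(nested, dict):  # an empty string means no replies
            collect_replies(nested['data']['children'], out)

collected = []
for listing in list_of_comments:
    collect_replies(listing['data']['children'], collected)
replies = pd.DataFrame(collected)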
This is the data:
start = eartt1.pivot_table('id',
                           index=['place', 'time_hour',
                                  'latitude',
                                  'longitude',
                                  'mag'],
                           columns='type',
                           aggfunc='count').reset_index()
start.head()
This is the function to create the features of the animation: time, location, icon
def create_geojson_features(df):
    features = []
    for _, row in df.iterrows():
        feature = {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                'coordinates': [row['longitude'], row['latitude']]
            },
            'properties': {
                'time': pd.to_datetime(row['time_hh'], unit='h').__str__(),
                'style': {'color': ''},
                'icon': 'circle',
                'iconstyle': {
                    'fillColor': row['fillcolor'],
                    'fillOpacity': 0.8,
                    'stroke': 'true',
                    'radius': row['mag'] * 10
                }
            }
        }
        features.append(feature)
    return features
This calls the function:
start_geojson = create_geojson_features(start)
start_geojson[0]
start_geojson[0] displays the first date as being in 1970, which is suspicious, as the dataset contains data from 1968 onward.
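That 1970 date is consistent with how pd.to_datetime treats numeric input: with unit='h', the value is interpreted as hours since the Unix epoch (1970-01-01), not as a calendar date. A quick illustration:

import pandas as pd

pd.to_datetime(0, unit='h')   # Timestamp('1970-01-01 00:00:00')
pd.to_datetime(24, unit='h')  # Timestamp('1970-01-02 00:00:00')

So if time_hh holds small hour counts, every feature lands near 1970. If the source column already contains real timestamps, converting those directly (without unit='h') would preserve the 1968-2021 range.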
This creates the animated map:
import folium
from folium.plugins import TimestampedGeoJson

EQ_map = folium.Map(location=[2, -2],
                    tiles="CartoDB Positron",
                    zoom_start=2)
TimestampedGeoJson(start_geojson,
                   period='PT1H',
                   duration='PT1H',
                   transition_time=1000,
                   auto_play=True).add_to(EQ_map)
EQ_map
time_slider_drag_update for TimestampedGeoJson is not updating the year as it moves through the data points; the data runs from 1968 to 2021. Please help ;(
You can find the entire notebook here:
https://nbviewer.org/github/panditadata/Earthquakes/blob/main/theone%20%281%29.ipynb#
or https://panditadata.com/theone_(3).html
For an API that I am using, we need to be able to see which specific pages are being clicked on, and output that to a CSV file. I can already see the average session duration and the number of page views. What do I need to add to the code below to make exporting to a CSV file possible? Thank you!
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials

SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = 'client_secrets.json'
VIEW_ID = 'insert here'

credentials = ServiceAccountCredentials.from_json_keyfile_name(KEY_FILE_LOCATION, SCOPES)

# Build the service object.
analytics = build('analyticsreporting', 'v4', credentials=credentials)

response = analytics.reports().batchGet(body={
    'reportRequests': [{
        'viewId': VIEW_ID,
        'dateRanges': [{'startDate': '30daysAgo', 'endDate': 'today'}],
        'metrics': [
            {"expression": "ga:pageviews"},
            {"expression": "ga:avgSessionDuration"}
        ],
        "dimensions": [
            {"name": "ga:deviceCategory"}
        ]
    }]
}).execute()
response
{'reports': [{'columnHeader': {'dimensions': ['ga:deviceCategory'],
'metricHeader': {'metricHeaderEntries': [{'name': 'ga:pageviews',
'type': 'INTEGER'},
{'name': 'ga:avgSessionDuration', 'type': 'TIME'}]}},
'data': {'isDataGolden': True,
'maximums': [{'values': ['485', '94.95454545454545']}],
'minimums': [{'values': ['29', '51.21186440677966']}],
'rowCount': 3,
'rows': [{'dimensions': ['desktop'],
'metrics': [{'values': ['485', '51.21186440677966']}]},
{'dimensions': ['mobile'],
'metrics': [{'values': ['409', '69.30859375']}]},
{'dimensions': ['tablet'],
'metrics': [{'values': ['29', '94.95454545454545']}]}],
'totals': [{'values': ['923', '60.06487341772152']}]}}]}
import pandas as pd

def ga_response_dataframe(response):
    row_list = []
    # Get each collected report
    for report in response.get('reports', []):
        # Set column headers
        column_header = report.get('columnHeader', {})
        dimension_headers = column_header.get('dimensions', [])
        metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
        # Get each row in the report
        for row in report.get('data', {}).get('rows', []):
            # Create a dict for each row
            row_dict = {}
            dimensions = row.get('dimensions', [])
            date_range_values = row.get('metrics', [])
            # Fill dict with dimension header (key) and dimension value (value)
            for header, dimension in zip(dimension_headers, dimensions):
                row_dict[header] = dimension
            # Fill dict with metric header (key) and metric value (value)
            for i, values in enumerate(date_range_values):
                for metric, value in zip(metric_headers, values.get('values')):
                    # Store floats as float, ints as int
                    if ',' in value or '.' in value:
                        row_dict[metric.get('name')] = float(value)
                    else:
                        row_dict[metric.get('name')] = int(value)
            row_list.append(row_dict)
    # Build the DataFrame from the collected row dicts
    df = pd.DataFrame(row_list)
    return df

df = ga_response_dataframe(response)
print(df)
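To capture which specific pages are clicked, one option (a sketch, untested against your view) is to add the page-level ga:pagePath dimension to the report request and then let pandas write the CSV; 'pageviews.csv' is an arbitrary filename chosen for illustration:

# In the reportRequests body, request per-page rows:
#     "dimensions": [{"name": "ga:pagePath"}]
df = ga_response_dataframe(response)
df.to_csv('pageviews.csv', index=False)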
So I'm trying to get a track from the Spotify API by searching for it using the search endpoint of the API (see documentation). First, I authorize myself so I can send GET requests. This happens without issues; I've added the code for reproducibility.
import requests

CLIENT_ID = "your_id_here"
CLIENT_SECRET = "your_secret_here"
AUTH_URL = "https://accounts.spotify.com/api/token"

auth_response = requests.post(AUTH_URL, {
    'grant_type': 'client_credentials',
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
})

# Convert response to JSON
auth_response_data = auth_response.json()

# Save the access token
access_token = auth_response_data['access_token']

# Pass the access token in the header to send a properly formed GET request
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}
Then, I want to use the search endpoint of the API to find a track by using the track name + artist (I need the track ID later on). When I use the example provided in the documentation, everything works fine and an artist object is returned by using the following query:
BASE_URL = 'https://api.spotify.com/v1/'
r = requests.get(BASE_URL + 'search?q=tania%20bowra&type=artist', headers=headers)
r = r.json()
This is the response, which looks exactly like the one in the documentation:
{'artists': {'href': 'https://api.spotify.com/v1/search?query=tania+bowra&type=artist&offset=0&limit=20',
'items': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/08td7MxkoHQkXnWAYD8d6Q'},
'followers': {'href': None, 'total': 235},
'genres': [],
'href': 'https://api.spotify.com/v1/artists/08td7MxkoHQkXnWAYD8d6Q',
'id': '08td7MxkoHQkXnWAYD8d6Q',
'images': [{'height': 640,
'url': 'https://i.scdn.co/image/ab67616d0000b2731ae2bdc1378da1b440e1f610',
'width': 640},
{'height': 300,
'url': 'https://i.scdn.co/image/ab67616d00001e021ae2bdc1378da1b440e1f610',
'width': 300},
{'height': 64,
'url': 'https://i.scdn.co/image/ab67616d000048511ae2bdc1378da1b440e1f610',
'width': 64}],
'name': 'Tania Bowra',
'popularity': 1,
'type': 'artist',
'uri': 'spotify:artist:08td7MxkoHQkXnWAYD8d6Q'}],
'limit': 20,
'next': None,
'offset': 0,
'previous': None,
'total': 1}}
Applying the same logic, I tried to get a track object from the API by using an artist and a track name, like so:
BASE_URL = 'https://api.spotify.com/v1/'
r = requests.get(BASE_URL + 'search?q=artist:queen%20track:bohemian%20rapsody&type=track', headers=headers)
r = r.json()
Even though I do get a valid response (status code 200), it seems to be empty:
{'tracks': {'href': 'https://api.spotify.com/v1/search?query=artist%3Aqueen+track%3Abohemian+rapsody&type=track&offset=0&limit=20',
'items': [],
'limit': 20,
'next': None,
'offset': 0,
'previous': None,
'total': 0}}
My question is: why is this happening?
You are now searching for the query artist:queen%20track:bohemian%20rapsody, while it should just be queen%20bohemian%20rapsody. The type parameter afterwards specifies which kinds of items you want returned. You don't have to specify the artist and track name separately in the query; treat the query just like something you'd type into the Spotify search bar.
Problem solved. It was rhapsody instead of rapsody... Sucks to be a non-native English speaker sometimes =)
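Putting both fixes together, a minimal sketch (assuming the headers from the auth step above; letting requests build the query string avoids hand-escaping):

BASE_URL = 'https://api.spotify.com/v1/'
r = requests.get(BASE_URL + 'search',
                 params={'q': 'queen bohemian rhapsody', 'type': 'track'},
                 headers=headers)
items = r.json()['tracks']['items']
track_id = items[0]['id'] if items else None  # the ID needed later on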
I have followed a tutorial on how to download data from Google Analytics with Python using the GA Reporting API. I was able to query the data I wanted, although I hit the row limit.
I saw in the documentation that there is a pageToken to avoid the issue. I have added this field to my request (as described in the documentation), but I am not able to make it work.
sample_request = {
    'viewId': '12345678',
    'dateRanges': {
        'startDate': datetime.strftime(datetime.now() - timedelta(days=30), '%Y-%m-%d'),
        'endDate': datetime.strftime(datetime.now(), '%Y-%m-%d')
    },
    'dimensions': [
        {'name': 'ga:date'},
        {'name': 'ga:dimension7'},
        {'name': 'ga:dimension6'},
        {'name': 'ga:dimension9'}
    ],
    'metrics': [
        {'expression': 'ga:users'},
        {'expression': 'ga:totalevents'}
    ],
    'pageSize': 100000,
    'pageToken': 'abc'
}

response = api_client.reports().batchGet(
    body={
        'reportRequests': sample_request
    }).execute()
You will hit the limit, but the parameter nextPageToken will allow you to page through multiple rows. For example:
def processReport(self, aDimensions):
    """Get a full report, returning the rows"""
    # Get the first set
    oReport = self.getReport(aDimensions)
    oResponse = self.getResponse(oReport, True)
    aRows = oResponse.get('rows')

    # Add any additional sets
    while oResponse.get('nextPageToken') is not None:
        oReport = self.getReport(aDimensions, oResponse.get('nextPageToken'))
        oResponse = self.getResponse(oReport, False)
        aRows.extend(oResponse.get('rows'))

    return aRows
You can see the complete program here:
https://github.com/aiqui/ga-download
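The same idea as a self-contained sketch against the question's api_client and sample_request (note that reportRequests must be a list): keep resending the request with the returned nextPageToken until none comes back.

rows = []
page_token = None
while True:
    if page_token is not None:
        sample_request['pageToken'] = page_token
    response = api_client.reports().batchGet(
        body={'reportRequests': [sample_request]}).execute()
    report = response['reports'][0]
    rows.extend(report.get('data', {}).get('rows', []))
    page_token = report.get('nextPageToken')
    if page_token is None:
        break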
I solved it like this:
def handle_report(analytics, pagetoken, rows):
    response = get_report(analytics, pagetoken)
    columnHeader = response.get("reports")[0].get('columnHeader', {})
    dimensionHeaders = columnHeader.get('dimensions', [])
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    pagetoken = response.get("reports")[0].get('nextPageToken', None)
    rowsNew = response.get("reports")[0].get('data', {}).get('rows', [])
    rows = rows + rowsNew
    print("len(rows): " + str(len(rows)))

    # Recurse until there is no more pagetoken
    if pagetoken is not None:
        return handle_report(analytics, pagetoken, rows)
    else:
        return rows

def main():
    analytics = initialize_analyticsreporting()

    global dfanalytics
    dfanalytics = []

    rows = []
    rows = handle_report(analytics, '0', rows)

    dfanalytics = pd.DataFrame(list(rows))