I'm trying to create a dataset of all my saved tracks on Spotify along with their metadata. I've already collected the audio features, the track name, and the track ID. Now I want to add a column for the track's artist and one for the genre.
I tried adding them through the "liked_tracks.extend" call, but I couldn't get it to work.
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd
from time import time

cid = ""
secret = ""
redirect_uri = 'http://localhost:8000/callback'

FEATURE_KEYS = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
OFFSET = 0
SAVED_TRACKS_LIMIT = 50
FEATURE_LIMIT = 100

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=cid,
                                               client_secret=secret,
                                               redirect_uri=redirect_uri,
                                               scope="user-library-read"))

# Page through the saved-tracks endpoint and keep the name and id of every track
liked_tracks = list()
while True:
    paged_tracks = sp.current_user_saved_tracks(offset=OFFSET, limit=SAVED_TRACKS_LIMIT)
    liked_tracks.extend([{'name': el['track']['name'],
                          'id': el['track']['id']} for el in paged_tracks['items']])
    print(f'Fetched {len(liked_tracks)} tracks')
    OFFSET += SAVED_TRACKS_LIMIT
    if paged_tracks['next'] is None:
        break

def get_windowed_track_ids(liked_tracks, limit):
    # Yield (window, ids) pairs so the audio features can be requested in batches
    for i in range(0, len(liked_tracks), limit):
        track_window = liked_tracks[i:i + limit]
        yield track_window, [t['id'] for t in track_window]

# Fetch the audio features in batches and merge them into each track dict
track_feature_list = list()
for track_window, track_window_ids in get_windowed_track_ids(liked_tracks, FEATURE_LIMIT):
    track_features = sp.audio_features(tracks=track_window_ids)
    for index, _track in enumerate(track_window):
        _track.update({k: v for k, v in track_features[index].items() if k in FEATURE_KEYS})
        track_feature_list.append(_track)
    print(f'Fetched features for {len(track_feature_list)} tracks')

df = pd.DataFrame.from_dict(track_feature_list)
mysavedsongs = f'liked_tracks_{int(time())}.csv'
df.to_csv(mysavedsongs, index=False)
print(f'Saved features to {mysavedsongs}')
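One hedged way to add those two columns (a sketch based on the usual Spotify Web API response shape, not a tested drop-in): each saved-track item already carries the artist name and ID, and genres live on the artist object rather than the track, so you can extend each dict with the artist and then resolve genres per artist ID with sp.artists.

# Sketch: collect the first listed artist while paging (replaces the extend above)
liked_tracks.extend([{'name': el['track']['name'],
                      'id': el['track']['id'],
                      'artist': el['track']['artists'][0]['name'],
                      'artist_id': el['track']['artists'][0]['id']}
                     for el in paged_tracks['items']])

# After paging, look genres up per artist in batches of 50 (the documented maximum
# for the several-artists endpoint) and attach them to every track.
artist_ids = list({t['artist_id'] for t in liked_tracks})
genres_by_artist = {}
for i in range(0, len(artist_ids), 50):
    for artist in sp.artists(artist_ids[i:i + 50])['artists']:
        genres_by_artist[artist['id']] = ', '.join(artist['genres'])
for t in liked_tracks:
    t['genre'] = genres_by_artist.get(t['artist_id'], '')

Since tracks themselves have no genre field in the Spotify API, going through the artist ID like this is the usual workaround; drop 'artist_id' before building the DataFrame if you don't want it as a column.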
I have a Python script that does data analytics on a CSV file, and I want to run it periodically in a Docker container. Every 15 seconds it should automatically look at a folder A; if there is a CSV file in it, it should process it and put an HTML report with the same name in folder B.
Here is my Python code:
# This program pulls data from a csv file and writes it out as an html file.
# The csv file contains device names, card names and the temperatures of the cards.
# The html file contains: how many devices and how many cards are in the system,
# which device holds the card with the highest temperature, and, in the table below,
# how many cards each device has in total, how many of them have a temperature of
# 70 or above, and the highest and average card temperatures per device.
# NOTE: the print calls in the program are only there for debugging.
import pandas as pd
from prettytable import PrettyTable

table = PrettyTable()   # table with one row per device
table2 = PrettyTable()  # table for the overall summary
table.field_names = ["Device", "Total # of Cards", "High Temp. Cards #", "Max Temperature", "Avg. Temperature"]
table2.field_names = [" ", " "]

df = pd.read_csv("cards.csv", sep=';', usecols=['Device', 'Card', 'Temperature'])
print(type(df))
print(df["Device"].nunique(), "\n\n")   # number of unique devices
total_devices = df["Device"].nunique()  # number of distinct device types
print(total_devices)
print(df["Device"].loc[1], "\n\n")
print(df['Temperature'].max(), "\n\n")
maxTemp = df['Temperature'].max()  # finding the max temperature
print("total card ")
i = 0
j = 1
# Find the card with the max temperature and the device that card belongs to
while j > 0:
    if df["Temperature"].loc[i] == df["Temperature"].max():
        print(df["Device"].loc[i])
        print(df["Card"].loc[i])
        deviceName = df["Device"].loc[i]
        cardName = df["Card"].loc[i]
        j = 0
    else:
        i = i + 1
dev_types = df["Device"].unique()  # device names
print("\n\n")
newstr = cardName + "/" + deviceName
# Build the summary table
table2.add_row(["Total Devices ", total_devices])
table2.add_row(["Total Cards ", len(df["Card"])])
table2.add_row(["Max Card Temperature ", df["Temperature"].max()])
table2.add_row(["Hottest Card / Device ", newstr])
print(table2)
row_num = len(df)
print(row_num)
# Re-read the file indexed by device so each device's cards and temperatures are grouped together; the per-device max temperature is taken from here
dn = pd.read_csv("cards.csv", sep=';', index_col=["Device"], usecols = ['Device','Card','Temperature'])
sum = []
high = []
#print("max temp: ", dn["Temperature"].loc[dev_types[1]].max())
for x in range(total_devices):  # total devices (3 in the example file)
    print("\n")
    cardCount = 0  # counts the number of cards belonging to the device
    count2 = 0     # counts the number of cards with a temperature of 70 or above
    tempcount = 0  # running sum of temperatures (used when calculating the average)
    print(dev_types[x])
    for y in range(row_num):
        if dev_types[x] == df["Device"].loc[y]:
            print(df["Temperature"].loc[y])
            tempcount = tempcount + df["Temperature"].loc[y]
            cardCount = cardCount + 1
            if df["Temperature"].loc[y] >= 70:
                count2 = count2 + 1
    maxT = dn["Temperature"].loc[dev_types[x]].max()  # max temperature among this device's cards
    avg = str(tempcount / cardCount)
    print("avg", avg)
    table.add_row([dev_types[x], cardCount, count2, maxT, avg])  # add this device's row to the table
    print("num of cards", cardCount)
    print("high temp cards", count2)
    print("\n\n")
print("\n\n")
print(table)
htmlCode = table.get_html_string()
htmlCode2 = table2.get_html_string()
# Use a context manager so the file is flushed and closed when the block ends
with open('devices.html', 'w') as f:
    f.write("SUMMARY")
    f.write(htmlCode2)
    f.write("DEVICES")
    f.write(htmlCode)
Whether or not the code is run in Docker doesn't matter.
Wrap all of that current logic (well, not the imports and so on) in a function, say, def process_cards().
Call that function forever, in a loop:
import logging
import time

def process_cards():
    table = PrettyTable()
    ...

def main():
    logging.basicConfig()
    while True:
        try:
            process_cards()
        except Exception:
            logging.exception("Failed processing")
        time.sleep(15)

if __name__ == "__main__":
    main()
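To cover the folder A to folder B part of the requirement, here is a hedged sketch (the folder names A and B and the process_cards(csv_path, html_path) signature are assumptions, adjust them to your setup) of a scanning step the loop can call instead of process_cards() directly:

from pathlib import Path

IN_DIR = Path("A")   # assumed input folder
OUT_DIR = Path("B")  # assumed output folder

def process_pending_files():
    # Look for csv files in folder A and write a same-named html report to folder B
    for csv_path in IN_DIR.glob("*.csv"):
        html_path = OUT_DIR / (csv_path.stem + ".html")
        if not html_path.exists():  # skip files that already have a report
            process_cards(csv_path, html_path)

main() would then call process_pending_files() inside the same try/except/sleep(15) loop.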
As an aside, your data processing code can be vastly simplified:
import pandas as pd
from prettytable import PrettyTable


def get_summary_table(df):
    summary_table = PrettyTable()  # create a table for summary
    total_devices = df["Device"].nunique()
    hottest_card = df.loc[df["Temperature"].idxmax()]
    hottest_device_desc = f"{hottest_card.Card}/{hottest_card.Device}"
    summary_table.add_row(["Total Devices", total_devices])
    summary_table.add_row(["Total Cards", len(df["Card"])])
    summary_table.add_row(["Max Card Temperature", df["Temperature"].max()])
    summary_table.add_row(["Hottest Card / Device ", hottest_device_desc])
    return summary_table


def get_devices_table(df):
    devices_table = PrettyTable(
        [
            "Device",
            "Total # of Cards",
            "High Temp. Cards #",
            "Max Temperature",
            "Avg. Temperature",
        ]
    )
    for device_name, group in df.groupby("Device"):
        count = len(group)
        avg_temp = group["Temperature"].mean()
        max_temp = group["Temperature"].max()
        high_count = group[group.Temperature >= 70]["Temperature"].count()
        print(f"{device_name=} {avg_temp=} {max_temp=} {high_count=}")
        devices_table.add_row([device_name, count, high_count, max_temp, avg_temp])
    return devices_table


def do_processing(csv_file="cards.csv", html_file="devices.html"):
    # df = pd.read_csv(csv_file, sep=';', usecols=['Device', 'Card', 'Temperature'])
    # (Just some random example data)
    df = pd.DataFrame({
        "Device": [f"Device {1 + x // 3}" for x in range(10)],
        "Card": [f"Card {x + 1}" for x in range(10)],
        "Temperature": [59.3, 77.2, 48.5, 60.1, 77.2, 61.1, 77.4, 65.8, 71.2, 60.3],
    })
    summary_table = get_summary_table(df)
    devices_table = get_devices_table(df)
    with open(html_file, "w") as f:
        f.write(
            "<style>table, th, td {border: 1px solid black; border-collapse: collapse;}</style>"
        )
        f.write("SUMMARY")
        f.write(summary_table.get_html_string(header=False))
        f.write("DEVICES")
        f.write(devices_table.get_html_string())


do_processing()
I have an example of a repeat decorator that runs your function every N seconds (or minutes, and so on).
I hope this sample helps you:
from typing import Optional, Callable, Awaitable
from functools import wraps
import asyncio


def repeat_every(*, seconds: float, wait_first: bool = False) -> Callable:
    def decorator(function: Callable[[], Optional[Awaitable[None]]]):
        is_coroutine = asyncio.iscoroutinefunction(function)

        @wraps(function)
        async def wrapped():
            async def loop():
                if wait_first:
                    await asyncio.sleep(seconds)
                while True:
                    try:
                        if is_coroutine:
                            await function()
                        else:
                            # asyncio has no run_in_threadpool; run sync callables in the default executor
                            await asyncio.get_running_loop().run_in_executor(None, function)
                    except Exception as e:
                        raise e
                    await asyncio.sleep(seconds)

            asyncio.create_task(loop())

        return wrapped

    print("Repeat every working well.")
    return decorator


@repeat_every(seconds=2)
async def main():
    print(2 * 2)


try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop and loop.is_running():
    print('Async event loop already running.')
    tsk = loop.create_task(main())
    tsk.add_done_callback(
        lambda t: print(f'Task done with result= {t.result()}'))
else:
    print('Starting new event loop')
    asyncio.run(main())
Alternatively, you can make a Docker entrypoint that runs the script from a cron job.
I am using PRAW to get data from Reddit and created this function to do so on multiple subreddits.
It works; however, I am working on a more concise/Pythonic version and can't figure out how to create a single for loop that does the job of the three loops below.
subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

# This function scrapes data from a list of subreddits.
# From these subreddits, I would like to get the new, hot and rising posts.
def get_data(size_new, size_hot, size_rising, subs_number):
    posts = []
    followers = []
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S')
    # getting x new posts
    for subreddit in subs.new(limit=size_new):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting x hot posts
    for subreddit in subs.hot(limit=size_hot):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting x rising posts
    for subreddit in subs.rising(limit=size_rising):
        date = subreddit.created
        date = datetime.datetime.fromtimestamp(date)
        if date >= targeted_date:
            posts.append([date, subreddit.subreddit, subreddit.title, subreddit.selftext])
    # getting subreddit subscriber counts
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit=1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])
    # creating the 2 dataframes
    df_1 = pd.DataFrame(followers, columns=['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns=['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by=['date']).reset_index(drop=True)
    # join the 2 dataframes together
    df = df.join(df_1.set_index('subreddit'), on='subreddit')
    df = df[["date", "subreddit", "subscribers", "title", 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop=True)
    return df
My request: how could this be more concise/optimized? What methodology do you use to make your code more readable, or better yet, to optimize it for runtime/computational resources?
Thank you.
There are various principles for writing better code, and various tools for finding the 'code smells' that may be lurking in it:
DRY - Don't Repeat Yourself
KISS - Keep It Simple, Stupid
SOLID
etc.
Taking a surface-level pass over the code you posted with some of these principles would refactor it into something like:
subs = r.subreddit('Futurology+wallstreetbets+DataIsBeautiful+RenewableEnergy+Bitcoin')

# check that the date is greater than the target date
# return true/false
def check_date(subreddit, targeted_date):
    return subreddit.created >= targeted_date

# get specific post data
def get_post_data(subreddit):
    return [subreddit.created, subreddit.subreddit, subreddit.title, subreddit.selftext]

# get posts by sort type
def get_subreddit_post_types(subreddit_sort, targeted_date):
    return [get_post_data(subreddit) for subreddit in subreddit_sort if check_date(subreddit, targeted_date)]

# This function scrapes data from a list of subreddits.
# From these subreddits, I would like to get the new, hot and rising posts.
def get_data(size_new, size_hot, size_rising, subs_number):
    targeted_date = '14-11-20 12:00:00'
    targeted_date = datetime.datetime.strptime(targeted_date, '%d-%m-%y %H:%M:%S').timestamp()
    posts = []
    followers = []
    # getting x new posts
    posts.extend(get_subreddit_post_types(subs.new(limit=size_new), targeted_date))
    # getting x hot posts
    posts.extend(get_subreddit_post_types(subs.hot(limit=size_hot), targeted_date))
    # getting x rising posts
    posts.extend(get_subreddit_post_types(subs.rising(limit=size_rising), targeted_date))
    # getting subreddit subscriber counts
    for sub_name in subs_2:
        for submission in r.subreddit(sub_name).hot(limit=1):
            followers.append([submission.subreddit, r.subreddit(sub_name).subscribers])
    # creating the 2 dataframes
    df_1 = pd.DataFrame(followers, columns=['subreddit', 'subscribers'])
    df = pd.DataFrame(posts, columns=['date', 'subreddit', 'title', 'text']).drop_duplicates().sort_values(by=['date']).reset_index(drop=True)
    # join the 2 dataframes together
    df = df.join(df_1.set_index('subreddit'), on='subreddit')
    df = df[["date", "subreddit", "subscribers", "title", 'text']]
    df = df[df.subscribers > subs_number].reset_index(drop=True)
    return df
As for optimizing your computational resources: what are you trying to optimize, memory or runtime? The process is the same for either: look at your code and see what can be changed to decrease one versus the other.
Looking at your code, one thing that would generally help is to examine which 'duplicate' posts you are actually fetching. The new/hot/rising listings pull posts from similar date ranges, and hot/rising may be completely encompassed by new; if so, you could drop the duplicate check (since you would no longer need to verify that the posts are different) and possibly drop the hot/rising calls entirely.
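If you do still want a literal single loop over the three listings, here is a hedged sketch (reusing subs, the size arguments, targeted_date and the helpers from the refactor above) that iterates over the listing generators themselves:

# One loop over the three listing generators instead of three copies of the same loop
listings = [subs.new(limit=size_new), subs.hot(limit=size_hot), subs.rising(limit=size_rising)]
posts = [get_post_data(submission)
         for listing in listings
         for submission in listing
         if check_date(submission, targeted_date)]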
This code returns JSON data on my localhost from the PokeAPI (https://pokeapi.co/), already filtered. But every time I reload the page my dictionary gets duplicated, and I only ever want the original. PS: the red mark in the screenshot (not included here) shows the original dictionary; the rest of it is wrong.
import requests
from django.http import JsonResponse

# helper lists
tipo = []   # aux for types
habil = []  # aux for abilities
lista_hab = []
lista_tipo = []
lista_todos = []
lista_aux = []
# dictionaries
dicio_stat = {}
dicio_todos = {}
dicio_aux = {}

def pokemons(request):
    for i in range(1, 10):
        url = f"https://pokeapi.co/api/v2/pokemon/{i}"
        requisicao = requests.get(url)
        try:
            lista = requisicao.json()
        except ValueError:
            print("TYPE ERROR")
        dicio = {
            'ID': lista['id'],
            'Nome': lista['name'],
            'Tipo': lista_tipo,
            'Peso': lista['weight'],
            'Altura': lista['height'],
            'Habilidades': lista_hab,
            'Estatisticas': dicio_stat,
            'Link_img': lista['sprites']['other']['official-artwork']['front_default']
        }
        for a in lista['abilities']:
            # dic_abi[i['ability']['name']] = i['ability']['url']
            habil.append(a['ability']['name'])
        dicio['Habilidades'] = habil[:]
        lista_hab.append(dicio.copy())
        for s in lista['stats']:
            dicio_stat[s['stat']['name']] = s['base_stat']
        for t in lista['types']:
            # dic_type[i['type']['name']] = i['type']['url']
            tipo.append(t['type']['name'])
        dicio['Tipo'] = tipo[:]
        lista_tipo.append(dicio.copy())
        dicio_aux = dicio.copy()
        lista_aux.append(dicio_aux)
        dicio_todos['pokemons'] = lista_aux
        habil.clear()
        tipo.clear()
        dicio.clear()
    return JsonResponse(dicio_todos)
(Screenshot of the returned JSON omitted.)
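A likely cause, judging only from the code shown (so treat this as an assumption): the helper lists and dictionaries are defined at module level, and in Django they persist between requests in the same process, so every page reload appends to them again. A minimal sketch of the same view with the accumulators moved inside the function (same keys and imports as above):

def pokemons(request):
    pokemons_list = []  # recreated on every request, so nothing accumulates across reloads
    for i in range(1, 10):
        lista = requests.get(f"https://pokeapi.co/api/v2/pokemon/{i}").json()
        pokemons_list.append({
            'ID': lista['id'],
            'Nome': lista['name'],
            'Tipo': [t['type']['name'] for t in lista['types']],
            'Peso': lista['weight'],
            'Altura': lista['height'],
            'Habilidades': [a['ability']['name'] for a in lista['abilities']],
            'Estatisticas': {s['stat']['name']: s['base_stat'] for s in lista['stats']},
            'Link_img': lista['sprites']['other']['official-artwork']['front_default'],
        })
    return JsonResponse({'pokemons': pokemons_list})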
Problem: I am trying to extract data through an API service. A single request can take anywhere from 3 to 10 seconds, and there are roughly 20,000 rows of data from a pandas DataFrame to feed into the API calls. I have managed to speed it up a bit through multiprocessing, but it's still running very slowly. Any suggestions?
Code:
import json
import multiprocessing
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
import requests


def scored_card_features2(source, n_batches):
    """Multiprocessing version of the scored card features function.
    Returns the reason for the rating.
    """
    # read in source data and convert to a list of lists for the inputs
    data = pd.read_excel(source)
    data = data[['primary_bank_report_id', 'primary_tu_credit_report_id', 'purpose']]
    inputs = data.values.tolist()

    def scored_card_map(i):
        """Form a request to the scored card service and retrieve the values."""
        url = "url/FourthGen?bank_report_id=%s&credit_report_id=%s&" \
              "&loan_purpose=%s" % (i[0], i[1], i[2].replace(" ", "%20"))
        r = requests.get(url)
        try:
            d = json.loads(r.text)
            l = [d['probability_of_default'],
                 d['condition'],
                 d['purpose_of_loan'],
                 d['rating'],
                 d['bank_report_id'],
                 d['reason_for_rating'],
                 d['credit_report_id']]
            return l
        except Exception:
            l = [np.nan] * 7
            return l

    # initiate the process pool
    with Pool(n_batches) as p:
        vals = p.map(scored_card_map, inputs)

    result = pd.DataFrame(vals, columns=['Probability of Default', 'Condition', 'Purpose of Loan',
                                         'Rating', 'Bank Report ID', 'Reason for Rating',
                                         'Credit Report ID'])
    result = result.dropna(how='all')
    return result


if __name__ == '__main__':
    # model features
    start = time.time()
    df = scored_card_features2('BankCreditPortalIDsPurpose.xlsx', multiprocessing.cpu_count() - 1)
    df.to_csv('scored_card_features.csv', index=False)
    end = time.time()
    print(end - start)
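One suggestion: the bottleneck here is I/O (each worker just waits on an HTTP response), so a thread pool with far more workers than CPU cores usually helps more than multiprocessing. A hedged sketch (it assumes scored_card_map is moved to module level so it can be shared, and the worker count of 32 is an arbitrary starting point to tune against the API's rate limits):

from concurrent.futures import ThreadPoolExecutor

def scored_card_features_threaded(source, max_workers=32):
    # Same input preparation as above
    data = pd.read_excel(source)
    data = data[['primary_bank_report_id', 'primary_tu_credit_report_id', 'purpose']]
    inputs = data.values.tolist()

    # Threads are cheap for I/O-bound work: most of the time is spent waiting on the API,
    # so many requests can be in flight at once without needing extra processes.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        vals = list(executor.map(scored_card_map, inputs))

    return pd.DataFrame(vals, columns=['Probability of Default', 'Condition', 'Purpose of Loan',
                                       'Rating', 'Bank Report ID', 'Reason for Rating',
                                       'Credit Report ID']).dropna(how='all')

Reusing a single requests.Session inside scored_card_map would also save the cost of re-establishing a connection for every call.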
Is there a way, using the Python API, to set an upper limit on the number of documents that are retrieved when scrolling in chunks of a specific size? Say I want a maximum of 100K documents scrolled in chunks of 2K, out of more than 10 million available documents.
I've implemented a counter-like variable, but I want to know if there is a more natural solution.
from elasticsearch import Elasticsearch

es_query = {"query": {"function_score": {"functions": [{"random_score": {"seed": "1234"}}]}}}
es = Elasticsearch(ADDRESS, port=PORT)

result = es.search(
    index="INDEX",
    doc_type="DOC_TYPE",
    body=es_query,
    size=2000,
    scroll="1m")

data = []
for hit in result["hits"]["hits"]:
    for d in hit["_source"]["attributes"]["data_of_interest"]:
        data.append(d)
do_something(*args)

scroll_id = result['_scroll_id']
scroll_size = result["hits"]["total"]

i = 0
while scroll_size > 0:
    if i % 10000 == 0:
        print("Scrolling ({})...".format(i))
    result = es.scroll(scroll_id=scroll_id, scroll="1m")
    scroll_id = result["_scroll_id"]
    scroll_size = len(result['hits']['hits'])
    data = []
    for hit in result["hits"]["hits"]:
        for d in hit["_source"]["attributes"]["data_of_interest"]:
            data.append(d)
    do_something(*args)
    i += 1
    if i == 100000:
        break
To me, if you only want the first 100K documents you should narrow your query in the first place; that will speed up the process. You could add a filter on date, for example.
Regarding the code, I don't know of another way than using the counter. I would just correct the indentation and remove the inner if statement for readability.
es_query = {"query": {"function_score": {"functions": [{"random_score": {"seed": "1234"}}]}}}
es = Elasticsearch(ADDRESS, port=PORT)

result = es.search(
    index="INDEX",
    doc_type="DOC_TYPE",
    body=es_query,
    size=2000,
    scroll="1m")

data = []
for hit in result["hits"]["hits"]:
    for d in hit["_source"]["attributes"]["data_of_interest"]:
        data.append(d)
do_something(*args)

scroll_id = result['_scroll_id']
scroll_size = result["hits"]["total"]

i = 0
while scroll_size > 0 and i < 100000:  # use 'and', not the bitwise '&'
    print("Scrolling ({})...".format(i))
    result = es.scroll(scroll_id=scroll_id, scroll="1m")
    scroll_id = result["_scroll_id"]
    scroll_size = len(result['hits']['hits'])
    # data = []  # why redefine the list?
    for hit in result["hits"]["hits"]:
        for d in hit["_source"]["attributes"]["data_of_interest"]:
            data.append(d)
    do_something(*args)
    i += 1  # Python has no 'i++' operator
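If you want something more natural than the hand-rolled counter, here is a hedged sketch using the scan helper from elasticsearch-py together with itertools.islice to cap the number of hits (it reuses es, es_query, the index name and the chunk size from above; the exact keyword arguments can vary a little between client versions):

from itertools import islice
from elasticsearch.helpers import scan

# scan() wraps the scroll API and yields hits one at a time;
# islice() stops the generator after the first 100,000 hits.
hits = scan(es, query=es_query, index="INDEX", size=2000, scroll="1m")
data = []
for hit in islice(hits, 100000):
    for d in hit["_source"]["attributes"]["data_of_interest"]:
        data.append(d)
do_something(*args)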