I am getting the following string format from a CSV file in pandas:
"title = matrix, genre = action, year = 2000, rate = 8"
How can I change the string value into a python dictionary like this:
movie = "title = matrix, genre = action, year = 2000, rate = 8"
movie = {
"title": "matrix",
"genre": "action",
"year": "2000",
"rate":"8"
}
You can split the string and then convert it into a dictionary.
A sample code is given below
# Turn a "key = value, key = value" string into a dict of stripped strings.
movie = "title = matrix, genre = action, year = 2000, rate = 8"

# Cut the string into "key = value" pieces, then split each piece on its
# FIRST "=" only, so a value that itself contains "=" stays intact.
# Pieces without any "=" (e.g. after a trailing comma) are skipped.
tempMovie = [pair.split("=", 1) for pair in movie.split(",") if "=" in pair]

# Rebind `movie` to the parsed dictionary, trimming surrounding whitespace.
movie = {key.strip(): value.strip() for key, value in tempMovie}
print(movie)
For the solution you can use regex
import re

input_user = "title = matrix, genre = action, year = 2000, rate = 8"

# Create a pattern to match the key-value pairs.
# The value group stops at the next comma, so the pair separator is never
# captured as part of the value (the earlier `[\w,]+` group swallowed it).
pattern = re.compile(r"(\w+)\s*=\s*([^,]+)")

# Find all (key, value) matches in the input string
matches = pattern.findall(input_user)

# Convert the matches to a dictionary, trimming stray whitespace from values
result = {key: value.strip() for key, value in matches}
print(result)
# The result:
# {'title': 'matrix', 'genre': 'action', 'year': '2000', 'rate': '8'}
I hope this can solve your problem.
# Example of the string stored in each cell of the movie column.
movie = "title = matrix, genre = action, year = 2000, rate = 8"

# Build {row index as str: parsed movie dict} for every row of `df`.
# `df` and `str_movie_column` must already be defined by the surrounding code.
dict_all_movies = {}
for idx in df.index:
    str_movie = df.at[idx, str_movie_column]
    # Pairs are separated by ", "; key and value by the first " = " only,
    # so a value containing " = " does not break the dict() construction.
    movie_dict = dict(item.split(" = ", 1) for item in str_movie.split(", "))
    dict_all_movies[str(idx)] = movie_dict
Related
I'm trying to return only a specific value from the "data" key in this response that I'm currently working with:
{
"dataset": {
"id": 49333506,
"dataset_code": "YMAB",
"database_code": "QOR",
"name": "Y-mAbs Therapeutics Inc. (YMAB) Option Earnings Crush, Liquidity, and Volatility Ratings",
"description": "Option Earnings Crush, Liquidity, and Volatility Ratings for Y-mAbs Therapeutics Inc. (YMAB). All time periods are measured in calendar days. See documentation for methodology.",
"refreshed_at": "2022-08-05 21:20:34 UTC",
"newest_available_date": "2022-08-05",
"oldest_available_date": "2020-02-12",
"column_names": [
"Date",
"EarningsCrushRate",
"CalendarDaysUntilEarnings",
"TradingDaysUntilEarnings",
"LiquidityRating",
"HasLeapOptions",
"HasWeeklyOptions",
"Iv30Rank",
"Iv30Percentile",
"Iv30Rating",
"Iv60Rank",
"Iv60Percentile",
"Iv60Rating",
"Iv90Rank",
"Iv90Percentile",
"Iv90Rating",
"Iv360Rank",
"Iv360Percentile",
"Iv360Rating"
],
"frequency": "daily",
"type": "Time Series",
"premium": true,
"limit": null,
"transform": null,
"column_index": null,
"start_date": "2020-02-12",
"end_date": "2022-08-05",
"data": [
[
"2022-08-05",
null,
null,
null,
2.0,
0.0,
0.0,
0.1437,
0.4286,
0.3706,
0.1686,
0.4762,
0.3936,
0.1379,
0.4502,
0.4129,
0.107,
0.5152,
0.4657
],
I only want to return the date, and a single value at a time from the "data": [ key that's within "dataset": {.
Here's the code I have so far, but I'm stuck on how to make this happen:
# Fetch the dataset for `symbol`; `symbol` and `apikey` must be defined beforehand.
r = requests.get(url=f"https://data.nasdaq.com/api/v3/datasets/QOR/{symbol}/data.json?api_key={apikey}")
d = r.json()
dataset = d['dataset_data']
data = dataset['data']
column_names = dataset['column_names']

# One variable per column header, in the order of the sample response's
# "column_names" list. Indexes 15 and 18 are the *rating* columns; they
# previously reused the ivrank90 / ivrank360 names and overwrote the rank values.
date = column_names[0]
ercrush = column_names[1]
calendar = column_names[2]
tradingdays = column_names[3]
liquidity = column_names[4]
leaps = column_names[5]
weeklies = column_names[6]
ivrank30 = column_names[7]
ivper30 = column_names[8]
ivrate30 = column_names[9]
ivrank60 = column_names[10]
ivper60 = column_names[11]
ivrate60 = column_names[12]
ivrank90 = column_names[13]
ivper90 = column_names[14]
ivrate90 = column_names[15]
ivrank360 = column_names[16]
ivper360 = column_names[17]
ivrate360 = column_names[18]

# First row of "data"; its entries line up positionally with column_names.
values = data[0]
For example - I'm only trying to return the Date, defined as column_names[0] paired with the value of "2022-08-05" that's within "data": [ , etc.
How would I go about doing this?
Thanks so much for any help.
I figured out the issue!
I created another variable called results = values and now I can pick the values I want and easily match them with the column_names!
Awesome!
The finished code that works:
import os

# Never embed the API key in source code: read it from the environment.
# NASDAQ_API_KEY is the variable name chosen here; export it before running.
# NOTE(review): the key that was previously published in this snippet should
# be considered leaked and revoked.
apikey = os.environ["NASDAQ_API_KEY"]

# Fetch the dataset for `symbol`; `symbol` must be defined beforehand.
r = requests.get(url=f"https://data.nasdaq.com/api/v3/datasets/QOR/{symbol}/data.json?api_key={apikey}")
d = r.json()
dataset = d['dataset_data']
data = dataset['data']
column_names = dataset['column_names']

# One variable per column header, in the order of the sample response's
# "column_names" list. Indexes 15 and 18 are the *rating* columns; they
# previously reused the ivrank90 / ivrank360 names and overwrote the rank values.
Date = column_names[0]
ercrush = column_names[1]
calendar = column_names[2]
tradingdays = column_names[3]
liquidity = column_names[4]
leaps = column_names[5]
weeklies = column_names[6]
ivrank30 = column_names[7]
ivper30 = column_names[8]
ivrate30 = column_names[9]
ivrank60 = column_names[10]
ivper60 = column_names[11]
ivrate60 = column_names[12]
ivrank90 = column_names[13]
ivper90 = column_names[14]
ivrate90 = column_names[15]
ivrank360 = column_names[16]
ivper360 = column_names[17]
ivrate360 = column_names[18]

# First row of "data"; its entries line up positionally with column_names.
values = data[0]

# The correction: pick a single value by position, e.g. index 2 pairs with
# column_names[2] (the `calendar` header above).
results = values[2]
print(results)
I want to use the .diff() function on the log_price column in my for loops. What I am after is the old log price value - the new log price value from the df_DC_product data frame. When I try to use .diff() inside the for loops it only returns NaN values. Any thoughts why this might be happening? Thank you for your help.
# Compute per-(Geography, Product) price-elasticity metrics from `data4`
# and collect one summary row per pair into a DataFrame.
DC_list = data4['Geography'].drop_duplicates().tolist()
Product_List = data4['Product'].drop_duplicates().tolist()

# create multiple empty lists to store values in:
my_dict = {
    "Product": [],
    "Geography": [],
    "Base Dollar Sales": [],
    "Base Unit Sales": [],
    "Price Numerator": [],
    "Price Denominator": [],
    "Demand Numerator": [],
    "Demand Denominator": [],
    "% Change in Price": [],
    "% Change in Demand": [],
    "Price Elasticity of Demand": [],
}

# Plain list of rows; list.append() mutates it in place.
dc_product_ped_with_metrics_all = []

for DC in DC_list:
    # Filtering to the loop's current DC (copy so new columns never touch data4)
    df_DC = data4.loc[data4['Geography'] == DC].copy()

    # Making a list of all of the current DC's Product to loop through
    Product_list = df_DC['Product'].drop_duplicates().tolist()

    for Product in Product_list:
        # Filtering to the Product
        df_DC_product = df_DC.loc[df_DC['Product'] == Product].copy()

        # Row-to-row differences and running half-sums.
        # NOTE: .diff() always leaves NaN in the FIRST row of the frame it is
        # called on, so a (Geography, Product) group with a single row yields
        # only NaN here, and NaN then propagates through every ratio below.
        # NOTE(review): columns are selected by position (5 and 6); presumably
        # these are log_price and log_units -- confirm against data4's layout.
        df_DC_product['pn'] = df_DC_product.iloc[:, 5].diff()
        df_DC_product['price_d'] = np.divide(df_DC_product.iloc[:, 5].cumsum(), 2)
        df_DC_product['dn'] = df_DC_product.iloc[:, 6].diff()
        df_DC_product['dd'] = np.divide(df_DC_product.iloc[:, 6].cumsum(), 2)
        df_DC_product['% Change in Demand'] = np.divide(df_DC_product['dn'], df_DC_product['dd']) * 100
        df_DC_product['% Change in Price'] = np.divide(df_DC_product['pn'], df_DC_product['price_d']) * 100
        df_DC_product['ped'] = np.divide(df_DC_product['% Change in Demand'], df_DC_product['% Change in Price'])

        # `Product` and `DC` are used as-is. The former `Product = Product,`
        # (trailing comma) turned the product into a 1-tuple in the output.
        sales = df_DC_product['Base_Dollar_Sales'].sum()
        qty = df_DC_product['Base_Unit_Sales'].sum()
        price = df_DC_product['Price'].mean()
        log_price = df_DC_product['log_price'].mean()
        log_units = df_DC_product['log_units'].sum()
        price_numerator = df_DC_product['pn'].mean()
        price_denominator = df_DC_product['price_d'].sum()
        demand_numerator = df_DC_product['dn'].mean()
        demand_denominator = df_DC_product['dd'].sum()
        delta_demand = df_DC_product['% Change in Demand'].sum()
        delta_price = df_DC_product['% Change in Price'].mean()
        ped = df_DC_product['ped'].mean()

        # Order must match `columns` below.
        dc_product_ped_with_metrics = [
            Product,
            DC,
            sales,
            qty,
            price,
            price_numerator,
            price_denominator,
            demand_numerator,
            demand_denominator,
            delta_demand,
            delta_price,
            ped,
        ]
        dc_product_ped_with_metrics_all.append(dc_product_ped_with_metrics)

columns = [
    'Product',
    'Geography',
    'Sales',
    'Qty',
    'Price',
    'Price Numerator',
    'Price Denominator',
    'Demand Numerator',
    'Demand Denominator',
    '% Change in Demand',
    '% Change in Price',
    'Price Elasticity of Demand',
]

dc_product_ped_with_metrics_all = pd.DataFrame(data=dc_product_ped_with_metrics_all, columns=columns)
dc_product_ped_with_metrics_all
`DataFrame.append()` doesn't update your dataframe in place; it returns a new object, so you need to reassign the result.
# NOTE(review): reassigning the result is only correct when
# `dc_product_ped_with_metrics_all` is a pandas DataFrame, and only on
# pandas < 2.0 (DataFrame.append was removed in 2.0; use pd.concat there).
# For a plain Python list, list.append() mutates in place and returns None,
# so the result must NOT be reassigned.
for DC in DC_list:
    # your code
    for Product in Product_list:
        # your code
        dc_product_ped_with_metrics_all = dc_product_ped_with_metrics_all.append(dc_product_ped_with_metrics)
I have tried using import pandas as pd and am able to produce a CSV, but it is not how I want it to be. I want the results to be saved/printed across the columns rather than down the rows (so job title as the header for column 'A', with all the results listed in the rows below it, etc.).
This is my code, what do I need to change to flip the results to appear the other way with all my results.
from requests_html import HTMLSession
import re
import pandas as pd
url = 'https://company.onefootball.com/jobs/#jobs-wrap'

# Keyword -> category tables. Each key is searched for in the scraped text
# with re.search(..., re.IGNORECASE) by the get_*_categories helpers.
departmentcategories = {
    "android": "Software Development",
    "social media": "Marketing",
    "content ": "Marketing",
    "sales": "Sales",
}

languagecategories = {
    " and ": "English",
    " und ": "German",
}

experiencecategories = {
    "senior": "Mid Senior Level",
    "Junior": "Entry Level",
}

# Load the jobs page and let its JavaScript render before querying it.
s = HTMLSession()
r = s.get(url)
r.html.render(sleep=1)

# XPath tests an attribute with "@id"; "#id" is not valid XPath.
jobs = r.html.xpath('//*[@id="jobs-wrap"]', first=True)
def _matching_categories(text, categories):
    """Return the labels of every keyword in *categories* found in *text*.

    Each key of *categories* is used as a case-insensitive regular
    expression; the label (value) of every key that matches somewhere in
    *text* is collected, in the mapping's iteration order. Duplicate labels
    are kept when several keys map to the same label.
    """
    return [
        label
        for keyword, label in categories.items()
        if re.search(keyword, text, re.IGNORECASE)
    ]


def get_department_categories(department):
    """Return the department labels whose keyword occurs in *department*."""
    return _matching_categories(department, departmentcategories)


def get_language_categories(language):
    """Return the language labels whose keyword occurs in *language*."""
    return _matching_categories(language, languagecategories)


def get_experience_categories(experience):
    """Return the experience-level labels whose keyword occurs in *experience*."""
    return _matching_categories(experience, experiencecategories)
# Column headers, in the same order as the values placed in each job row.
columns = [
    'Job Title',
    'City',
    'Country',
    'Company',
    'Department',
    'Experience Level',
    'Language',
    'Industry',
    'Link',
]

# One list per job posting. Collecting rows (instead of overwriting a single
# list) is what lets the DataFrame lay the fields out across the columns.
rows = []

for item in jobs.absolute_links:
    r = s.get(item)
    job_title = r.html.find('h1.headline', first=True).text

    # City and country come from the same "City, Country" element.
    city = r.html.find('p.h6', first=True).text
    if city == ('Berlin, Germany'):
        city = 'Berlin'
    country = r.html.find('p.h6', first=True).text
    if country == ('Berlin, Germany'):
        country = 'Germany'

    # Section for the department, languages, and experience level
    department = r.html.find('div.job-content--parsed', first=True).text
    department_cats = get_department_categories(department)
    # XPath tests an attribute with "@id"; "#id" is not valid XPath.
    language = r.html.xpath('//*[@id="job"]/div[1]/div[2]', first=True).text
    language_cats = get_language_categories(language)
    experience = r.html.find('div.job-content--parsed', first=True).text
    experience_cats = get_experience_categories(experience)

    joblist = [job_title, city, country, "OneFootball", ", ".join(department_cats), ", ".join(experience_cats), ", ".join(language_cats), "Sport", item]
    rows.append(joblist)

# A list of rows plus explicit column names yields one row per job and one
# column per field; index=False keeps the job title in the first CSV column.
df = pd.DataFrame(rows, columns=columns)
print(df.head())
df.to_csv('newtest.csv', index=False)
I am implementing emotion analysis using an LSTM model; I have already finished the training and the prediction parts, but my predictions all appear in a single column, as shown below.
Here are my codes:
# Persist the raw scraped data before processing it.
with open('output1.json', 'w') as f:
    json.dump(new_data, f)

selection1 = new_data['selection1']

# creating empty lists to be able to create a dataframe
names = []
dates = []
commentss = []
labels = []
hotelname = []

for item in selection1:
    name = item['name']
    hotelname.append(name)

    # First pass over the reviews: one (name, date) entry per review.
    Date = item['reviews']
    for d in Date:
        names.append(name)
        # convert date from 'january 12, 2020' to 2020-01-12
        date = pd.to_datetime(d['date']).strftime("%Y-%m-%d")
        # adding date to the list dates[]
        dates.append(date)

    # Second pass over the same reviews: clean the text and classify it.
    CommentID = item['reviews']
    for com in CommentID:
        comment = com['review']
        lcomment = comment.lower()  # converting all to lowercase
        result = re.sub(r'\d+', '', lcomment)  # remove numbers
        results = (result.translate(
            str.maketrans('', '', string.punctuation))).strip()  # remove punctuation and surrounding white space
        comments = remove_stopwords(results)
        commentss.append(comment)

        # keep only the words in comments that are present in the word2id dictionary
        encoded_samples = [[word2id[word] for word in comments if word in word2id]]
        # Padding
        encoded_samples = keras.preprocessing.sequence.pad_sequences(encoded_samples, maxlen=max_words)
        # Make predictions
        label_probs, attentions = model_with_attentions.predict(encoded_samples)
        label_probs = {id2label[_id]: prob for (label, _id), prob in zip(label2id.items(), label_probs[0])}
        labels.append(label_probs)

# creating dataframe
dataframe = {'name': names, 'date': dates, 'comment': commentss, 'classification': labels}
table = pd.DataFrame(dataframe, columns=['name', 'date', 'comment', 'classification'])

# Write the records to disk. The return value is deliberately not bound to
# `json`: to_json() returns None when given a path, and rebinding `json`
# would shadow the json module used at the top of this script.
table.to_json('hotel.json', orient='records')
here is the results i obtain:
[
{
"name": "Radisson Blu Azuri Resort & Spa",
"date": "February 02, 2020",
"comment": [
"enjoy",
"daily",
"package",
"start",
"welcoming",
"end",
"recommend",
"hotel"
],
"label": {
"joy": 0.0791392997,
"surprise": 0.0002606699,
"love": 0.4324670732,
"sadness": 0.2866959572,
"fear": 0.0002588668,
"anger": 0.2011781186
}
},
you can find the complete output on this link: https://jsonblob.com/a9b4035c-5576-11ea-afe8-1d95b3a2e3fd
Is it possible to break the label field into separate fields like below??
[
{
"name": "Radisson Blu Azuri Resort & Spa",
"date": "February 02, 2020",
"comment": [
"enjoy",
"daily",
"package",
"start",
"welcoming",
"end",
"recommend",
"hotel"
],
"joy": 0.0791392997,
"surprise": 0.0002606699,
"love": 0.4324670732,
"sadness": 0.2866959572,
"fear": 0.0002588668,
"anger": 0.2011781186
},
Can someone please help me how do i need to modify my codes and make this possible please guys explain to me please..
If you can't do it before you produce the result, you can easily manipulate that dictionary like so:
def move_labels_to_dict_root(result, key="labels"):
    """Return a copy of *result* with its nested label dict flattened into the root.

    Args:
        result: Mapping holding metadata plus a nested dict of labels.
        key: Name of the entry that holds the nested dict. Defaults to
            "labels"; pass e.g. key="label" or key="classification" when
            the records use a different field name.

    Returns:
        A new dict with every entry of *result* except *key*, followed by
        the entries of the nested dict. On a name clash the nested value
        wins.

    Raises:
        KeyError: If *key* is not present in *result*.
    """
    nested = result[key]
    # Build a fresh dict instead of deleting from `result`, so the caller's
    # record is left untouched.
    flattened = {name: value for name, value in result.items() if name != key}
    flattened.update(nested)
    return flattened
and then call move_labels_to_dict_root in a list comprehension like [move_labels_to_dict_root(result) for result in results].
However, I would ask why you want to do this?
I have a JSON data as:
{
"persons": [
{
"city": "Seattle",
"name": "Brian",
"dob" : "19-03-1980"
},
{
"city": "Amsterdam",
"name": "David",
"dob" : "19-09-1979"
},
{
"city": "London",
"name": "Joe",
"dob" : "19-01-1980"
},
{
"city": "Kathmandu",
"name": "Brian",
"dob" : "19-03-1980"
}
]
}
How can I count the individual elements, like, number of person born in Jan-Dec (0 if none were born) and born in given year using python in one single iteration. Also the number of unique names registered in each month
Like:
1980 :3
--Jan:1
--Mar:2
1979 :1
--Sep:1
Names:
Mar 1980: 1 #Brian is same for both cities
Jan 1980: 1
Sep 1979: 1
counters_mon is the counter that has values for specific months of year
# Print every key/count pair held in counters_mon as "<key>=<count>".
for k_mon, v_mon in counters_mon.items():
    print('{}={}'.format(k_mon, v_mon))
But I want details too to be printed. How can I achieve this?
import json
# Load the persons document; the context manager closes the file handle.
with open('/path/to/your/json', 'r') as f:
    persons = json.load(f)

# year -> {'count': births in that year, 'months': {month: births}}
years_months = {}
# "MM YYYY" -> set of distinct names born in that month
years_months_names = {}

# Single pass over the data fills both structures.
for person in persons['persons']:
    # dob is sliced as "DD-MM-YYYY": last four chars = year, chars 3-4 = month
    year = person['dob'][-4:]
    month = person['dob'][3:5]
    month_year = month + ' ' + year
    name = person['name']

    # Create the year entry on first sight, then count uniformly.
    year_entry = years_months.setdefault(year, {'count': 0, 'months': {}})
    year_entry['count'] += 1
    year_entry['months'][month] = year_entry['months'].get(month, 0) + 1

    # A set keeps each name once per month, whatever the city.
    years_months_names.setdefault(month_year, set()).add(name)

# Births per year, followed by that year's per-month breakdown.
for k, v in years_months.items():
    print(k + ': ' + str(v['count']))
    for month, count in v['months'].items():
        print("-- " + str(month) + ": " + str(count))

# Number of unique names per month.
for k, v in years_months_names.items():
    print(k + ": " + str(len(v)))
I'm assuming that you have the path to your json. I also tested my answer on the JSON that you've posted, and be careful to make sure that your JSON is structured correctly.
This is a good case for using defaultdicts (https://docs.python.org/3/library/collections.html#collections.defaultdict).
data # assume you have your data in a var called data
from collections import defaultdict
from calendar import month_abbr
# slightly strange construction here but we want 2 levels of defaultdict followed by lists
aggregate = defaultdict(lambda: defaultdict(list))

# then the population is super simple - you'll end up with something like
# aggregate[year][month] = [name1, name2]
for person in data['persons']:
    # dob is "DD-MM-YYYY"; int() makes years and months sort numerically
    day, month, year = map(int, person['dob'].split('-'))
    aggregate[year][month].append(person['name'])

# I'm sorting in chronological order for printing
# First report: total births per year, then the per-month breakdown.
for year, months in sorted(aggregate.items()):
    print('{}: {}'.format(year, sum(len(names) for names in months.values())))
    for month, names in sorted(months.items()):
        print('--{}: {}'.format(month_abbr[month], len(names)))

# Second report: number of distinct names per month.
for year, months in sorted(aggregate.items()):
    for month, names in sorted(months.items()):
        print('{} {}: {}'.format(month_abbr[month], year, len(set(names))))
Depending on how the data was going to be used I'd actually consider not having the complex nesting in the aggregation and instead opt for something like aggregate[(year, month)] = [name1, name2,...]. I find that the more nested my data, the more confusing it is to work with.
EDIT Alternatively you can create several structures on the first pass so the printing step is simplified. Again, I'm using defaultdict to clean up all the provisioning.
agg_years = defaultdict(lambda: defaultdict(int))  # [year][month] = counter
agg_years_total = defaultdict(int)  # [year] = counter
agg_months_names = defaultdict(set)  # [(year, month)] = set(name1, name2...)

# Single pass: update all three aggregates for each person.
for person in data['persons']:
    # dob is "DD-MM-YYYY"; int() makes years and months sort numerically
    day, month, year = map(int, person['dob'].split('-'))
    agg_years[year][month] += 1
    agg_years_total[year] += 1
    agg_months_names[(year, month)].add(person['name'])

# Births per year, followed by that year's per-month breakdown.
for year, months in sorted(agg_years.items()):
    print('{}: {}'.format(year, agg_years_total[year]))
    for month, quant in sorted(months.items()):
        print('--{}: {}'.format(month_abbr[month], quant))

# Number of distinct names per (year, month), in chronological order.
for (year, month), names in sorted(agg_months_names.items()):
    print('{} {}: {}'.format(month_abbr[month], year, len(names)))