I have tried using import pandas as pd and am able to produce a csv but it is not how I want it to be. I want it to be saved/printed across the columns rather than down as rows (so job title as the header for column 'A' and all the results listed in the rows etc...
This is my code, what do I need to change to flip the results to appear the other way with all my results.
from requests_html import HTMLSession
import re
import pandas as pd
url = 'https://company.onefootball.com/jobs/#jobs-wrap'
departmentcategories = {
"android": "Software Development",
"social media": "Marketing",
"content ": "Marketing",
"sales": "Sales",
}
languagecategories = {
" and ": "English",
" und ": "German",
}
experiencecategories = {
"senior": "Mid Senior Level",
"Junior": "Entry Level",
}
s = HTMLSession()
r = s.get(url)
r.html.render(sleep=1)
jobs = r.html.xpath('//*[#id="jobs-wrap"]', first=True)
def get_department_categories(department):
depcats = []
for k, v in departmentcategories.items():
if re.search(k, department, re.IGNORECASE):
depcats.append(v)
return depcats
def get_language_categories(language):
langcats = []
for k, v in languagecategories.items():
if re.search(k, language, re.IGNORECASE):
langcats.append(v)
return langcats
def get_experience_categories(experience):
expcats = []
for k, v in experiencecategories.items():
if re.search(k, experience, re.IGNORECASE):
expcats.append(v)
return expcats
for item in jobs.absolute_links:
r = s.get(item)
job_title = r.html.find('h1.headline', first=True).text
city = r.html.find('p.h6', first=True).text
if city == ('Berlin, Germany'):
city = 'Berlin'
country = r.html.find('p.h6', first=True).text
if country == ('Berlin, Germany'):
country = 'Germany'
#Section for the department, languages, and experience level
department = r.html.find('div.job-content--parsed', first=True).text
department_cats = get_department_categories(department)
language = r.html.xpath('//*[#id="job"]/div[1]/div[2]', first=True).text
language_cats = get_language_categories(language)
experience = r.html.find('div.job-content--parsed', first=True).text
experience_cats = get_experience_categories(experience)
joblist = [job_title, city, country, "OneFootball", ", ".join(department_cats), ", ".join(experience_cats), ", ".join(language_cats), "Sport", item]
df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('newtest.csv')
Related
I am getting following string format from csv file in Pandas
"title = matrix, genre = action, year = 2000, rate = 8"
How can I change the string value into a python dictionary like this:
movie = "title = matrix, genre = action, year = 2000, rate = 8"
movie = {
"title": "matrix",
"genre": "action",
"year": "1964",
"rate":"8"
}
You can split the string and then convert it into a dictionary.
A sample code is given below
movie = "title = matrix, genre = action, year = 2000, rate = 8"
movie = movie.split(",")
# print(movie)
tempMovie = [i.split("=") for i in movie]
movie = {}
for i in tempMovie:
movie[i[0].strip()] = i[1].strip()
print(movie)
For the solution you can use regex
import re
input_user = "title = matrix, genre = action, year = 2000, rate = 8"
# Create a pattern to match the key-value pairs
pattern = re.compile(r"(\w+) = ([\w,]+)" )
# Find all matches in the input string
matches = pattern.findall(input_user)
# Convert the matches to a dictionary
result = {key: value for key, value in matches}
print(result)
The result:
{'title': 'matrix,', 'genre': 'action,', 'year': '2000,', 'rate': '8'}
I hope this can solve your problem.
movie = "title = matrix, genre = action, year = 2000, rate = 8"
dict_all_movies = {}
for idx in df.index:
str_movie = df.at[idx, str_movie_column]
movie_dict = dict(item.split(" = ") for item in str_movie.split(", "))
dict_all_movies[str(idx)] = movie_dict
I'm trying to return only a specific value from the "data" key in this response that I'm currently working with:
{
"dataset": {
"id": 49333506,
"dataset_code": "YMAB",
"database_code": "QOR",
"name": "Y-mAbs Therapeutics Inc. (YMAB) Option Earnings Crush, Liquidity, and Volatility Ratings",
"description": "Option Earnings Crush, Liquidity, and Volatility Ratings for Y-mAbs Therapeutics Inc. (YMAB). All time periods are measured in calendar days. See documentation for methodology.",
"refreshed_at": "2022-08-05 21:20:34 UTC",
"newest_available_date": "2022-08-05",
"oldest_available_date": "2020-02-12",
"column_names": [
"Date",
"EarningsCrushRate",
"CalendarDaysUntilEarnings",
"TradingDaysUntilEarnings",
"LiquidityRating",
"HasLeapOptions",
"HasWeeklyOptions",
"Iv30Rank",
"Iv30Percentile",
"Iv30Rating",
"Iv60Rank",
"Iv60Percentile",
"Iv60Rating",
"Iv90Rank",
"Iv90Percentile",
"Iv90Rating",
"Iv360Rank",
"Iv360Percentile",
"Iv360Rating"
],
"frequency": "daily",
"type": "Time Series",
"premium": true,
"limit": null,
"transform": null,
"column_index": null,
"start_date": "2020-02-12",
"end_date": "2022-08-05",
"data": [
[
"2022-08-05",
null,
null,
null,
2.0,
0.0,
0.0,
0.1437,
0.4286,
0.3706,
0.1686,
0.4762,
0.3936,
0.1379,
0.4502,
0.4129,
0.107,
0.5152,
0.4657
],
I only want to return the date, and a single value at a time from the "data": [ key that's within "dataset": {.
Here's the code I have so far, but am stuck as to make this happen:
r = requests.get(url=f"https://data.nasdaq.com/api/v3/datasets/QOR/{symbol}/data.json?api_key={apikey}")
d = r.json()
dataset = d['dataset_data']
data = dataset['data']
column_names = dataset['column_names']
date = column_names[0]
ercrush = column_names[1]
calendar = column_names[2]
tradingdays = column_names[3]
liquidity = column_names[4]
leaps = column_names[5]
weeklies = column_names[6]
ivrank30 = column_names[7]
ivper30 = column_names[8]
ivrate30 = column_names[9]
ivrank60 = column_names[10]
ivper60 = column_names[11]
ivrate60 =column_names[12]
ivrank90 = column_names[13]
ivper90 = column_names[14]
ivrank90 = column_names[15]
ivrank360= column_names[16]
ivper360 = column_names[17]
ivrank360 = column_names[18]
values = data[0]
For example - I'm only trying to return the Date, defined as column_names[0] paired with the value of "2022-08-05" that's within "data": [ , etc.
How would I go about doing this?
Thanks so much for any help.
I figured out the issue!
I created another variable called results = values and now I can pick the values I want and easily match them with the column_names!
Awesome!
The finished code that works:
r = requests.get(url=f"https://data.nasdaq.com/api/v3/datasets/QOR/{symbol}/data.json?api_key=KyVWdRX_o26L5XNUkgqN")
d = r.json()
dataset = d['dataset_data']
data = dataset['data']
column_names = dataset['column_names']
Date = column_names[0]
ercrush = column_names[1]
calendar = column_names[2]
tradingdays = column_names[3]
liquidity = column_names[4]
leaps = column_names[5]
weeklies = column_names[6]
ivrank30 = column_names[7]
ivper30 = column_names[8]
ivrate30 = column_names[9]
ivrank60 = column_names[10]
ivper60 = column_names[11]
ivrate60 =column_names[12]
ivrank90 = column_names[13]
ivper90 = column_names[14]
ivrank90 = column_names[15]
ivrank360= column_names[16]
ivper360 = column_names[17]
ivrank360 = column_names[18]
values = data[0]
results = values[2] #the correction
print(results)
I'm trying to extract the hash rate for 3 cryptocurrencies and I have attached the code for the same below. Now, I want to pass three urls and in return I need three different different dictionaries which should have the values. I'm stuck and I don't understand how should I go about it. I have tried using loops but it is not working out for me.
url = {'Bitcoin' : 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}
for ele in url:
#### requesting the page and extracting the script which has date and values
session = requests.Session()
page = session.get(ele[i])
soup = BeautifulSoup(page.content, 'html.parser')
values = str(soup.find_all('script')[4])
values = values.split('d = new Dygraph(document.getElementById("container"),')[1]
#create an empty dict to append date and hashrates
dict([("crypto_1 %s" % i,[]) for i in range(len(url))])
#run a loop over all the dates and adding to dictionary
for i in range(values.count('new Date')):
date = values.split('new Date("')[i+1].split('"')[0]
value = values.split('"),')[i+1].split(']')[0]
dict([("crypto_1 %s" % i)[date] = value
You can use next example how to get data from all 3 URLs and create a dataframe/dictionary from it:
import re
import requests
import pandas as pd
url = {
"Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
"Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
"Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}
data = []
for name, u in url.items():
html_doc = requests.get(u).text
for date, hash_rate in re.findall(
r'\[new Date\("(.*?)"\),(.*?)\]', html_doc
):
data.append(
{
"Name": name,
"Date": date,
"Hash Rate": float("nan")
if hash_rate == "null"
else float(hash_rate),
}
)
df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df["Date"])
# here save df to CSV
# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/HashRate:
out = {}
for name, g in df.groupby("Name"):
out[name] = g[["Date", "Hash Rate"]].to_dict(orient="list")
print(out)
Prints:
{
"Bitcoin": {
"Date": [
Timestamp("2009-01-03 00:00:00"),
Timestamp("2009-01-04 00:00:00"),
Timestamp("2009-01-05 00:00:00"),
...
I am working on a project for a python class, where we have to create a vin number look up tool. This code below works so far for country and year made. However I am having an issue with how I am indexing the user input.
Sometimes a country is the first two characters, and other times the country code is only the first character. I do not how to get around this problem, I tried using an if else kind of method while iterating through the dictionary, but it did not work.
class vinfo():
def __init__(self):
""" Get user input """
vin = input("Enter Vin: ")
""" Stupidity check """
unaccepted_chars = [
"-","/",".",","," ","?",
"^","$","*","(",")","[",
"]","{","}","#","!","_",
"+","'","=","|","#","<",
">","`","~","&"
]
for char in vin:
if char in unaccepted_chars:
vin = vin.replace(char, "")
else:
print("", end = "")
""" Length check and index """
if len(vin) == 17:
self.country_filt1 = vin[0]
self.country_filt2 = vin[0:2]
self.engine_filt = vin[7]
self.year_filt = vin[9]
self.manufact_filt = vin[1:3]
self.manufact_filt1 = vin[1:4]
else:
print("You must've entered something really stupid.\n(must be 17 characters, letters are uppercase)")
""" Vin Code Data """
# each manufacturer seems to have multiple vins for different car styles
# and for different countries
manufact_codes = {
"1G1":"Chevy",
"":"",
}
year_codes = {
"M":"2021","L":"2020","K":"2019",
"J":"2018","H":"2017","G":"2016",
"F":"2015","E":"2014","D":"2013",
"C":"2012","B":"2011","A":"2010",
"9":"2009","8":"2008","7":"2007",
"6":"2006","5":"2005","4":"2004",
"3":"2003","2":"2002","1":"2001",
"Y":"2000","X":"1999","W":"1998",
"V":"1997","T":"1996","S":"1995",
"R":"1994","P":"1993","N":"1992",
"M":"1991","L":"1990","K":"1989",
"J":"1988","H":"1987","G":"1986",
"F":"1985","E":"1984","D":"1983",
"C":"1982","B":"1981","A":"1980"
}
country_codes = {
"1": "USA",
"4": "USA",
"5": "USA",
"7F": "USA",
"3X": "Mexico",
"37": "Mexico",
"3A": "Canada",
"3W": "Canada",
"W": "Germany",
"SA": "United Kingdom",
"SM": "United Kingdom",
"J": "Japan",
"KL": "Korea",
"KR": "Korea"
}
engine_codes = {
}
""" Define the vehicles attributes using the find() function below """
self.year = self.find(self.year_filt, year_codes)[0]
# self.country = self.find(self.country_filt, country_codes)[0]
# self.manufact = self.find(self.manufact_filt, manufact_codes)
self.engine = self.find(self.engine_filt, engine_codes)
""" Find country (different lengths of country codes) """
for key, value in country_codes.items():
if key == self.country_filt1:
country = value
elif key == self.country_filt2:
country = value
else:
country = "Unsupported code"
""" Same for manufacturer """
for key, value in manufact_codes.items():
if key == self.manufact_filt:
manufact = value
elif key == self.manufact_filt1:
manufact = value
else:
manufact = "Unsupported code"
self.info = print(f"Year: {self.year}\nManufacturer: {manufact}\nCountry: {country}\nEngine: {self.engine}")
""" search through the dictionaries """
def find(self, filt, dict_of_codes):
try:
info = [value for key, value in dict_of_codes.items() if key == filt]
except:
info = "Unsupported"
if len(info) > 1:
info += " (Could be any of these)"
return info
I created this code where I am able to pull the data I want but not able to sort it as it should be. I am guessing it has to do with the way I am appending each item by ignoring index but I can't find my way around it.
This is my code:
import json
import pandas as pd
#load json object
with open("c:\Sample.json","r",encoding='utf-8') as file:
data = file.read()
data2 = json.loads(data)
print("Type:", type(data2))
cls=['Image', 'Email', 'User', 'Members', 'Time']
df = pd.DataFrame(columns = cls )
for d in data2['mydata']:
for k,v in d.items():
#print(k)
if k == 'attachments':
#print(d.get('attachments')[0]['id'])
image = (d.get('attachments')[0]['id'])
df=df.append({'Image':image},ignore_index = True)
#df['Message'] = image
if k == 'author_user_email':
#print(d.get('author_user_email'))
email = (d.get('author_user_email'))
df=df.append({'Email':email}, ignore_index = True)
#df['Email'] = email
if k == 'author_user_name':
#print(d.get('author_user_name'))
user = (d.get('author_user_name'))
df=df.append({'User':user}, ignore_index = True)
#df['User'] = user
if k == 'room_name':
#print(d.get('room_name'))
members = (d.get('room_name'))
df=df.append({'Members':members}, ignore_index = True)
#df['Members'] = members
if k == 'ts_iso':
#print(d.get('ts_iso'))
time = (d.get('ts_iso'))
df=df.append({'Time':time}, ignore_index = True)
#df['Time'] = time
df
print('Finished getting Data')
df1 = (df.head())
print(df)
print(df.head())
df.to_csv(r'c:\sample.csv', encoding='utf-8')
The code gives me this as the result
I am looking to get this
Data of the file is this:
{
"mydata": [
{
"attachments": [
{
"filename": "image.png",
"id": "888888888"
}
],
"author_user_email": "email#email.com",
"author_user_id": "91",
"author_user_name": "Marlone",
"message": "",
"room_id": "999",
"room_members": [
{
"room_member_id": "91",
"room_member_name": "Marlone"
},
{
"room_member_id": "9191",
"room_member_name": " +16309438985"
}
],
"room_name": "SMS [Marlone] [ +7777777777]",
"room_type": "sms",
"ts": 55,
"ts_iso": "2021-06-13T18:17:32.877369+00:00"
},
{
"author_user_email": "email#email.com",
"author_user_id": "21",
"author_user_name": "Chris",
"message": "Hi",
"room_id": "100",
"room_members": [
{
"room_member_id": "21",
"room_member_name": "Joe"
},
{
"room_member_id": "21",
"room_member_name": "Chris"
}
],
"room_name": "Direct [Chris] [Joe]",
"room_type": "direct",
"ts": 12345678910,
"ts_iso": "2021-06-14T14:42:07.572479+00:00"
}]}
Any help would be appreciated. I am new to python and am learning on my own.
Try:
import json
import pandas as pd
with open("your_data.json", "r") as f_in:
data = json.load(f_in)
tmp = []
for d in data["mydata"]:
image = d.get("attachments", [{"id": None}])[0]["id"]
email = d.get("author_user_email")
user = d.get("author_user_name")
members = d.get("room_name")
time = d.get("ts_iso")
tmp.append((image, email, user, members, time))
df = pd.DataFrame(tmp, columns=["Image", "Email", "User", "Members", "Time"])
print(df)
Prints:
Image Email User Members Time
0 888888888 email#email.com Marlone SMS [Marlone] [ +7777777777] 2021-06-13T18:17:32.877369+00:00
1 None email#email.com Chris Direct [Chris] [Joe] 2021-06-14T14:42:07.572479+00:00
Although the other answer does work, pandas has a built in reader for json files pd.read_json: https://pandas.pydata.org/pandas-docs/version/1.1.3/reference/api/pandas.read_json.html
It has the benefit of being able to handle very large datasets via chunking, as well as processing quite a few different formats. The other answer would not be performant for a large dataset.
This would get you started:
import pandas as pd
df = pd.read_json("c:\Sample.json")
The probblem is that append() adds a new row. So, you have to use at[] https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.at.html specifying the index/row. Se below. Some print/debug messages were left and path to input and output files was changed a little because I'm on Linux.
import json
import pandas as pd
import pprint as pp
#load json object
with open("Sample.json","r",encoding='utf-8') as file:
data = file.read()
data2 = json.loads(data)
#pp.pprint(data2)
cls=['Image', 'Email', 'User', 'Members', 'Time']
df = pd.DataFrame(columns = cls )
pp.pprint(df)
index = 0
for d in data2['mydata']:
for k,v in d.items():
#print(k)
if k == 'attachments':
#print(d.get('attachments')[0]['id'])
image = (d.get('attachments')[0]['id'])
df.at[index, 'Image'] = image
#df['Message'] = image
if k == 'author_user_email':
#print(d.get('author_user_email'))
email = (d.get('author_user_email'))
df.at[index, 'Email'] = email
#df['Email'] = email
if k == 'author_user_name':
#print(d.get('author_user_name'))
user = (d.get('author_user_name'))
df.at[index, 'User'] = user
#df['User'] = user
if k == 'room_name':
#print(d.get('room_name'))
members = (d.get('room_name'))
df.at[index, 'Members'] = members
#df['Members'] = members
if k == 'ts_iso':
#print(d.get('ts_iso'))
time = (d.get('ts_iso'))
df.at[index, 'Time'] = time
#df['Time'] = time
index += 1
# start indexing from 0
df.reset_index()
# replace empty str/cells witn None
df.fillna('None', inplace=True)
pp.pprint(df)
print('Finished getting Data')
df1 = (df.head())
print(df)
print(df.head())
df.to_csv(r'sample.csv', encoding='utf-8')