Convert dataframe columns containing commas into numeric data for plotting - python

I'm new to the world of plotting in Python. I started learning today by doing a mini project on my own: I tried to scrape some data and plot it. Here's my code:
import requests
import pandas as pd
from pandas import DataFrame
import numpy as np
import bs4
from bs4 import BeautifulSoup
import matplotlib.pyplot as plot
# Getting the HTML page
URL = "https://www.worldometers.info/coronavirus/#countries"
pag_html = requests.get(URL).text
# Extracting data with BeautifulSoup.
soup = BeautifulSoup(pag_html, 'html.parser')
tabla = soup.find("table", id="main_table_countries_today")
datos_tabla = tabla.tbody.find_all("tr")
Lista = []
for x in range(len(datos_tabla)):
values = [j.string for j in datos_tabla[x].find_all('td')]
Lista.append(values)
df = pd.DataFrame(Lista).iloc[7: , 1:9]
nombre_columna = ["Pais", "Casos totales", "Nuevos Casos", "Muertes totales", "Nuevas Muertes", "Total Recuperados", "Nuevos Recuperados", "Activos"]
df.columns = nombre_columna
df.plot(x="Pais", y="Casos totales", kind ="barh")
plot.show()
The error it gives me is: "TypeError: no numeric data to plot". I understand that this error occurs because the column "Casos totales" contains strings, not floats.
I tried to convert the columns of my DataFrame into floats, but nothing worked; I kept getting errors everywhere.
Does anyone have any idea how I can plot my DataFrame?
Thanks.

After running the script I can confirm that, as you say, the column "Casos totales" is being interpreted as strings because of the commas in the values. You can fix this by removing the commas with .str.replace(',','') and then converting with .astype(float), right after renaming the columns of your dataframe:
# Strip the thousands separators first so the strings can be parsed as numbers.
casos_sin_comas = df['Casos totales'].str.replace(',', '')
df['Casos totales'] = casos_sin_comas.astype(float)

# With a numeric column available, the horizontal bar chart can be drawn.
df.plot(x="Pais", y="Casos totales", kind="barh")
plot.show()
And this plots the graph (although the visualization is quite poor, but that's another story)

Related

Adding tooltip to folium.features.GeoJson from a geopandas dataframe

I am having issues adding tooltips to my folium.features.GeoJson. I can't get columns to display from the dataframe when I select them.
feature = folium.features.GeoJson(df.geometry,
name='Location',
style_function=style_function,
tooltip=folium.GeoJsonTooltip(fields= [df.acquired],aliases=["Time"],labels=True))
ax.add_child(feature)
For some reason when I run the code above it responds with
Name: acquired, Length: 100, dtype: object is not available in the data. Choose from: ().
I can't seem to link the data to my tooltip.
I have made your code an MWE (minimal working example) by including some data.
There are two key issues with your code:
1. You need to pass the properties, not just the geometry, to folium.features.GeoJson(). Hence I passed df instead of df.geometry.
2. folium.GeoJsonTooltip() takes a list of property (column) names, not an array of values. Hence I passed a column name such as ["acquired"] instead of the array of values from a dataframe column.
There is also an implied issue with your code: all dataframe columns need to contain values that can be serialised to JSON. Hence the conversion of acquired to a string column ("tt" in the code below) and the drop() of the original column.
import geopandas as gpd
import pandas as pd
import shapely.wkt
import io
import folium
df = pd.read_csv(io.StringIO("""ref;lanes;highway;maxspeed;length;name;geometry
A3015;2;primary;40 mph;40.68;Rydon Lane;MULTILINESTRING ((-3.4851169 50.70864409999999, -3.4849879 50.7090007), (-3.4857269 50.70693379999999, -3.4853034 50.7081574), (-3.488620899999999 50.70365289999999, -3.4857269 50.70693379999999), (-3.4853034 50.7081574, -3.4851434 50.70856839999999), (-3.4851434 50.70856839999999, -3.4851169 50.70864409999999))
A379;3;primary;50 mph;177.963;Rydon Lane;MULTILINESTRING ((-3.4763853 50.70886769999999, -3.4786112 50.70811229999999), (-3.4746017 50.70944449999999, -3.4763853 50.70886769999999), (-3.470350900000001 50.71041779999999, -3.471219399999999 50.71028909999998), (-3.465049699999999 50.712158, -3.470350900000001 50.71041779999999), (-3.481215600000001 50.70762499999999, -3.4813909 50.70760109999999), (-3.4934747 50.70059599999998, -3.4930204 50.7007898), (-3.4930204 50.7007898, -3.4930048 50.7008015), (-3.4930048 50.7008015, -3.4919513 50.70168349999999), (-3.4919513 50.70168349999999, -3.49137 50.70213669999998), (-3.49137 50.70213669999998, -3.4911565 50.7023015), (-3.4911565 50.7023015, -3.4909108 50.70246919999999), (-3.4909108 50.70246919999999, -3.4902349 50.70291189999999), (-3.4902349 50.70291189999999, -3.4897693 50.70314579999999), (-3.4805021 50.7077218, -3.4806265 50.70770150000001), (-3.488620899999999 50.70365289999999, -3.4888806 50.70353719999999), (-3.4897693 50.70314579999999, -3.489176800000001 50.70340539999999), (-3.489176800000001 50.70340539999999, -3.4888806 50.70353719999999), (-3.4865751 50.70487679999999, -3.4882604 50.70375799999999), (-3.479841700000001 50.70784459999999, -3.4805021 50.7077218), (-3.4882604 50.70375799999999, -3.488620899999999 50.70365289999999), (-3.4806265 50.70770150000001, -3.481215600000001 50.70762499999999), (-3.4717096 50.71021009999998, -3.4746017 50.70944449999999), (-3.4786112 50.70811229999999, -3.479841700000001 50.70784459999999), (-3.471219399999999 50.71028909999998, -3.4717096 50.71021009999998))"""),
sep=";")
df = gpd.GeoDataFrame(df, geometry=df["geometry"].apply(shapely.wkt.loads), crs="epsg:4326")
df["acquired"] = pd.date_range("8-feb-2022", freq="1H", periods=len(df))
def style_function(x):
    """Return the style applied to a GeoJSON feature on the map.

    The argument *x* (the feature being styled) is ignored: every feature
    gets the same blue line with a weight of 3.
    """
    return {"color": "blue", "weight": 3}
ax = folium.Map(
location=[sum(df.total_bounds[[1, 3]]) / 2, sum(df.total_bounds[[0, 2]]) / 2],
zoom_start=12,
)
# Datetime values are not JSON serializable, so format them into a string
# column ("tt") that the tooltip can display.
df["tt"] = df["acquired"].dt.strftime("%Y-%b-%d %H:%M")
# Pass the dataframe (properties included), not just the geometry, and drop the
# raw datetime column so only JSON-serializable columns remain.
# The tooltip takes a list of column names ("tt"), shown under the alias "Time".
feature = folium.features.GeoJson(df.drop(columns="acquired"),
name='Location',
style_function=style_function,
tooltip=folium.GeoJsonTooltip(fields= ["tt"],aliases=["Time"],labels=True))
# Attach the styled GeoJSON layer to the map.
ax.add_child(feature)

Pandas Python problem

import pandas as pd
nba = pd.read_csv("nba.csv")
names = pd.Series(nba['Name'])
data = nba['Salary']
nba_series = (data, index=[names])
print(nba_series)
Hello, I am trying to convert the columns 'Name' and 'Salary' from a dataframe into a Series. I need to set the names as the index and the salaries as the values, but I cannot figure it out. This is my best attempt so far; any guidance is appreciated.
I think you are over-thinking this. Simply construct it with pd.Series(). Note that the data needs to be passed with .values; otherwise pandas aligns the Salary column's original index with the new Name index and you'll get NaNs.
import pandas as pd
# Load the player table from the CSV file.
nba = pd.read_csv("nba.csv")
# Pass the raw salary values (.values) rather than the Series itself: a Series
# passed as data is aligned to the new index by label, and its original labels
# would not match the names, leaving only NaN.
nba_series = pd.Series(data=nba['Salary'].values, index=nba['Name'])
Maybe try set_index?
# Make the 'Name' column the index of the dataframe (the keyword is `inplace`
# and the column label is case-sensitive), then select the salaries: the
# resulting Series is indexed by player name.
nba.set_index('Name', inplace=True)
nba_series = nba['Salary']
This might help you
import pandas as pd
nba = pd.read_csv("nba.csv")
# Selecting a single column of a dataframe already yields a Series.
names = nba['Name']
data = nba['Salary']
# Use the player names as the index of the salary Series.
data.index = names
data.index = names might be correct, but that depends on the data.

mask function doesn't get rid of unwanted data

I'm working on a data frame taken from Adafruit IO, and sadly some of my data is from a time when my project malfunctioned, so some of the values are just NaN.
I tried to remove them with these lines of code:
onlyValidData=temp_data.mask(temp_data['value'] =='NaN')
onlyValidData
This is data retrieved from an Adafruit IO feed and analyzed with pandas. I tried using the 'where' function too, but it didn't work.
my entire code is
import pandas as pd
temp_data = pd.read_json('https://io.adafruit.com/api/(...)')
light_data = pd.read_json('https://io.adafruit.com/api/(...)')
temp_data['created_at'] = pd.to_datetime(temp_data['created_at'], infer_datetime_format=True)
temp_data = temp_data.set_index('created_at')
light_data['created_at'] = pd.to_datetime(light_data['created_at'], infer_datetime_format=True)
light_data = light_data.set_index('created_at')
tempVals = pd.Series(temp_data['value'])
lightVals = pd.Series(light_data['value'])
onlyValidData=temp_data.mask(temp_data['value'] =='NaN')
onlyValidData
The output is all of my data for some reason, but it should be only the valid values.
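Hey, I think the issue here is that you're looking for values equal to the string 'NaN', while actual NaN values aren't strings: NaN is a special floating-point "missing" marker that never compares equal to anything (not even to itself), so an == test cannot find it.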
Try using:
onlyValidData = temp_data.mask(temp_data['value'].isnull())
Edit: to remove rows rather than marking all values in that row as NaN:
# Drop only the rows whose 'value' is missing. Without `subset`, dropna()
# would also discard rows that have a NaN in any other column.
onlyValidData = temp_data.dropna(subset=['value'])

Parsing output from scraped webpage using pandas and bs4: way to make output more readable?

I wanted to scrape this page.
I wrote this code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
res = requests.get("http://yadamp.unisa.it/showItem.aspx?yadampid=18")
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
print(df[0].to_json(orient='records'))
But the output isn't ideal. The output is:
[{"0":"ID","1":"18","2":"NAME","3":"Colutellin-A Blast NCBI-PROT","4":null,"5":null},{"0":"LENGTH","1":"7","2":"DISULFIDE BRIDGE","3":null,"4":"View PDB \/\/ Small molecules can be embedded in the page var glmol02 = new GLmol('glmol02');","5":null},{"0":"SEQUENCE","1":"VISIIPV","2":null,"3":null,"4":null,"5":null},{"0":"HELICITY","1":"85.70","2":"INSTAB. INDEX","3":"31.97","4":"FLEXIBILITY","5":"5.43"},{"0":"a HYD. MOM.","1":"16.35","2":"b HYD. MOM.","3":"9.04","4":"c HYD. MOM","5":"1.37"},{"0":"a MEAN HYD. MOM.","1":"2.34","2":"b MEAN HYD. MOM.","3":"1.29","4":"c MEAN HYD. MOM.","5":"0.20"},{"0":"CHARGE pH5","1":"0.00","2":"CHARGE pH7","3":"0.00","4":"CHARGE pH9","5":"-0.17"},{"0":"\u0394 CHARGE pH5-pH9","1":"0.17","2":"ISOELECTRIC POINT","3":"5.49","4":"BOMAN INDEX","5":"-2.78"},{"0":"\u0394G","1":"-368","2":"CPP","3":"-027","4":"MLP","5":"-006"},{"0":"MOLECULAR VOLUME","1":null,"2":"POLARITY","3":null,"4":null,"5":null},{"0":"MIC E. coli","1":null,"2":"MIC P. aeruginosa","3":null,"4":"MIC S. typhimurium","5":null},{"0":"MIC S. aureus","1":null,"2":"MIC M. luteus","3":null,"4":"MIC B. subtilis","5":null},{"0":"MIC C. albicans","1":null,"2":"OTHER","3":"S.sclerotiorum = 30.86; B.cinerea = 10.29","4":null,"5":null},{"0":"MIC OTHER gram+","1":null,"2":null,"3":null,"4":null,"5":null},{"0":"MIC OTHERgram-","1":null,"2":null,"3":null,"4":null,"5":null},{"0":"PHYLUM","1":"Ascomycota","2":"CLASS","3":"Sordariomycetes","4":"ORDER","5":"Glomerellales"},{"0":"FAMILY","1":"Glomerellaceae","2":"GENUS","3":"Colletotrichum","4":"SPECIES","5":"Colletotrichum dematium"},{"0":"DATE","1":"2008","2":null,"3":null,"4":null,"5":null},{"0":"TITLE PAPER","1":"Colutellin A, an immunosuppressive peptide from Colletotrichum dematium","2":null,"3":null,"4":null,"5":null}]
You can see it's difficult for me to make sense of this list, because I have to loop through a list of multiple dictionaries, and then join pairs of keys together. I was hoping the output would be more like:
ID 18
Name Colutellin-A
Helicity 85.7
etc....just something more readable. Can anyone pinpoint a section of the code that I should change to improve this?
Thanks
You can use pandas read_html() to get the table and then navigate the table using pandas DataFrame(), see the code below!
# Let pandas fetch the page and pick the table(s) identified by these
# attributes, using the first row as the header.
url = 'http://yadamp.unisa.it/showItem.aspx?yadampid=18'
table_attrs = {'class': 'table table-responsive'}
table = pd.read_html(url, attrs=table_attrs, header=0)

# read_html returns a list of DataFrames; display the first one.
first_table = pd.DataFrame(table[0])
print(first_table)

Normalize columns in a numpy array- results in typeerror

I want to do a simple normalization of the data in a numpy ndarray.
Specifically, I want (X - mu) / sigma. I tried using the exact code
that I found in earlier questions, but kept getting the error "TypeError:
cannot perform reduce with flexible type". I gave up and tried a simpler
normalization method, (X - min) / X.ptp, and got the same error.
import csv
import numpy as np
from numpy import *
import urllib.request
#Import comma separated data from git.hub
url = 'http://archive.ics.uci.edu/ml/machine-learning-
databases/wine/wine.data'
urllib.request.urlretrieve(url,'F:/Python/Wine Dataset/wine_data')
#open file
filename = 'F:/Python/Wine Dataset/wine_data';
raw_data = open(filename,'rt');
#Put raw_data into a numpy.ndarray
reader = csv.reader(raw_data);
x = list(reader);
data = np.array(x)
#First column is classification, other columns are features
y= data[:,0];
X_raw = data[:,1:13];
# Attempt at normalizing data- really wanted X-mu/sigma gave up
# even this simplified version doesn't work
# latest error is TypeError cannot perform reduce with flexible type?????
X = (X_raw - X_raw.min(0)) / X_raw.ptp(0);
print(X);
#
#
#
#
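Finally figured it out. The line "data = np.array(x)" returned an array containing string data.
was:
data = np.array(x)
changed to:
data = np.array(x).astype(float)
(I originally wrote .astype(np.float); np.float was only an alias for the built-in float, was deprecated in NumPy 1.20 and removed in 1.24, so plain float is the spelling that keeps working.)
After that everything worked - a simple issue that cost me hours.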

Categories

Resources