import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
from time import sleep
from random import randint
import re
# Scrape a few demographic figures for each town from city-data.com and print
# them: sex split, total population, religion shares and education levels.
towns = pd.DataFrame()
town_names = [
    "Abbeville-Alabama",
    "Abernant-Alabama",
    "Alpine-Utah",
    "Dixon-Montana",
    "Adak-Alaska",
]
for town_name in town_names:
    page = requests.get(f"https://www.city-data.com/city/{town_name}.html").text
    doc = BeautifulSoup(page, "html.parser")
    print(town_name)

    # Sex split: the "(NN.N%)" figures inside the population-by-sex element.
    # The unpacking expects exactly two such figures.
    sex_population = str(doc.find(id="population-by-sex"))
    (males, females) = [float(x) for x in re.findall(r"(?<=\()[0-9]+\.[0-9]+(?=\%\))", sex_population)]
    print(males, females)

    # Poverty level is not extracted yet.
    # NOTE: re.findall() returns a list, so one match has to be picked (e.g.
    # with [0]) before float() can be applied; a lookbehind is spelled (?<=...).
    # poverty_level = str(doc.find(id="poverty-level"))
    # broke = float(re.findall(r"(?<=</b> )[0-9]*\.[0-9]*", poverty_level)[0])
    # print(broke)

    # Total population: the number right after the bold label. Commas are part
    # of the match; matching digits only would stop at the first thousands
    # separator and truncate e.g. "2,358" to "2".
    total_population = str(doc.find(id="city-population"))
    residents = float(re.findall(r"(?<=</b> )[0-9,]+", total_population)[0].replace(",", ""))
    print(residents)

    # Religion: one table row per group, name in the first cell and the
    # adherent count in the second.
    religion_population = doc.find(id="religion").find_all('tr')
    data = []
    for row in religion_population:
        columns = row.find_all('td')
        if columns:
            religion = columns[0].get_text(strip=True)
            # Drop thousands separators; a "-" placeholder is turned into 0.
            number = columns[1].get_text(strip=True).replace(",", "").replace("-", "0")
            print(f'religion: {religion} | number: {number}')
            data.append([religion, int(number)])
    df = pd.DataFrame(data, columns=['religion', 'number'])
    # Share of each group among all counted adherents.
    df['percentage'] = (df['number'] / df['number'].sum()) * 100
    atheist = df[df.religion == "None"].iloc[0]["percentage"]
    evangelicals = df[df.religion == "Evangelical Protestant"].iloc[0]["percentage"]
    print(atheist)
    print(evangelicals)

    # Education: walk the <li> items, whose <b> child holds the label and whose
    # remaining text holds the figure. Searching inside each <b> for further
    # <b> tags never reached the numbers, so nothing was collected.
    education_population = doc.find(id="education-info").find_all('li')
    data = []
    for row in education_population:
        label_tag = row.find('b')
        if label_tag is None:
            continue
        label = label_tag.get_text(strip=True)
        # Whatever text remains in the <li> once the label is removed is the value.
        value_text = row.get_text(" ", strip=True).replace(label, "", 1)
        # NOTE(review): assumes the figures are printed as percentages such as
        # "77.1%" -- confirm against the live page. Items without a percentage
        # are skipped.
        match = re.search(r"([0-9][0-9,]*(?:\.[0-9]+)?)\s*%", value_text)
        if match is None:
            continue
        # Labels presumably end with a colon; strip it so the lookups below match.
        education = label.rstrip(":")
        ed_number = float(match.group(1).replace(",", ""))
        print(f'education: {education} | number: {ed_number}')
        data.append([education, ed_number])
    df = pd.DataFrame(data, columns=['education', 'percentage'])
    phds = df[df.education == "Graduate or professional degree"].iloc[0]["percentage"]
    highschoolgrads = df[df.education == "High school or higher"].iloc[0]["percentage"]
    print(phds)
    print(highschoolgrads)
    print("\n")
How do I get the education information into a dataframe? I'm attempting to organize the values into education level and percentage.
Also any idea why when I try to cast poverty level as a float in broke, it says it can't because it's a list?
At this point I'm just typing so that stackoverflow will allow me to post because it thinks I don't have enough details. SO... if anyone is interested what this is for I'm doing a datamining/machine learning project where it's going to take the scores from the HRC's municipality equality index and the info about the cities scored and try to learn how to estimate the score of cities which haven't been scored. It's my first machine learning project ever. I worry that I've picked something too big for my first time but I'm really already committed to it.
There is a problem in the scraper.
In the education section of the page, the values are in list elements (li) and the title/label is in a b tag inside each li.
In the scraper you are only collecting the b tags, so you're missing the numeric value that sits next to each label. Iterate over the li elements instead and read both the b label and the remaining text.
Thanks!
I'm having a problem trying to count different variables for the same name. I have a sheet with the names of all my workers and I need to count how many trainings each of them had, but those trainings have different classifications: "Comercial", "Funcional" and others...
One of my columns is "Name" and the other is "Trainings". How can I filter those trainings and aggregate them per name?
import pandas as pd
import numpy as np
# Open the workbook and load its "Base" sheet.
xls = pd.ExcelFile('BASE_Indicadores_treinamento_2021 - V3.xlsx')
df = pd.read_excel(xls, sheet_name='Base')
display(df)

# Count the non-null "Eixo" entries for every "Nome" and turn the result back
# into a two-column frame.
df2 = df.groupby("Nome")["Eixo"].count().reset_index()
display(df2)
What I'm getting is the TOTAL number of trainings per name, but I need the count for each of the categories I have in trainings (there are 5 of them). Does anyone know what I need to do?
Thankss
df.groupby("Nome").agg('count') should give you the total number of trainings for each person.
df.groupby(["Nome","Eixo"]).agg({'Eixo':'count'}) should give you the count for each person and each training category.
Problem solved!
Here's what i did
import pandas as pd
import numpy as np
# Load the "Base" sheet of the training workbook.
xls = pd.ExcelFile('BASE_Indicadores_treinamento_2021 - V3.xlsx')
df = pd.read_excel(xls, 'Base')
display(df)

# One boolean mask per training category found in the "Eixo" column.
filt_funcional = df['Eixo'] == 'Funcional'
filt_comercial = df['Eixo'] == 'Comercial'
filt_liderança = df['Eixo'] == 'Liderança'
filt_negocio = df['Eixo'] == 'Negócio'
filt_obr_cert = df['Eixo'] == 'Obrigatórios e Certificações'

# Trainings per worker for a single category. Rows and column are selected in
# one .loc call instead of chaining .loc[...][...], and the result is shown
# explicitly so it is visible even when this is not the last line of a cell.
display(df.loc[filt_funcional, 'Nome'].value_counts())

# All categories at once: one row per worker, one column per "Eixo" value,
# each cell holding how many trainings of that category the worker had.
display(pd.crosstab(df['Nome'], df['Eixo']))
Much easier than i thought!
And giving the credits, i only did bc of this video: https://www.youtube.com/watch?v=txMdrV1Ut64
import pandas as pd
nba = pd.read_csv("nba.csv")
names = pd.Series(nba['Name'])
data = nba['Salary']
# Build the Series explicitly: salaries as the values, player names as the
# index. The bare values are passed (not the Series itself) so that pandas
# does not try to align the salaries' existing row labels with the new name
# labels, which would produce NaN.
nba_series = pd.Series(data.values, index=names)
print(nba_series)
Hello, I am trying to convert the columns 'Name' and 'Salary' of a dataframe into a Series. I need to set the names as the index and the salaries as the values, but I cannot figure it out. This is my best attempt so far; any guidance is appreciated.
I think you are over-thinking this. Simply construct it with pd.Series(). Note that the data needs to be passed with .values, otherwise you'll get NaNs.
import pandas as pd
nba = pd.read_csv("nba.csv")
# Salaries as a bare array become the values; the "Name" column labels them.
salary_values = nba['Salary'].values
nba_series = pd.Series(salary_values, index=nba['Name'])
Maybe try set_index?
# Make the player names the row labels of `nba`, modifying it in place. The
# keyword is `inplace`, and the column is spelled "Name" in the neighbouring
# snippets that read the same CSV.
nba.set_index('Name', inplace=True)
# Selecting the salary column now yields a Series indexed by player name.
nba_series = nba['Salary']
This might help you
import pandas as pd
nba = pd.read_csv("nba.csv")
names = nba['Name']
# Selecting a single column already gives a Series. Work on a copy so that
# re-labelling it below cannot leak back into `nba`.
data = nba['Salary'].copy()
# Use the player names as the index of the salary Series. `names` is the
# Series defined above; `nba_series` was never defined in this snippet.
data.index = names
data.index = names might be correct but depends on the data
I have dataframe like this.
import pandas as pd
#create dataframe
# Sample data: "Number" is the raw series, "Number is Corrected" is the result
# the asker expects after correction.
df = pd.DataFrame({
    "Date": range(0, 22),
    "Country": ["USA"] * 22,
    # A comma was missing after this list, which made the literal a SyntaxError.
    "Number": [0,0,0,0,0,1,1,3,5,6,4,6,7,8,7,10,25,50,75,60,45,100],
    "Number is Corrected": [0,0,0,0,0,1,1,3,5,6,6,6,7,7,7,10,25,50,50,60,60,100],
})
But this dataframe has a problem: some numbers are wrong.
A number must never be larger than the number that follows it, but that rule is broken in places (6,4,6,7,8,7...50,75,60,45,100).
I don't use df.sort because it's not about sorting, it's about correction.
Edit: I added corrected numbers in "number is corrected" column.
guessing from your 'Number corrected' list, you could probably use this:
import pandas as pd
#create dataframe
df = pd.DataFrame({
    "Date": range(0, 22),
    "Country": ["USA"] * 22,
    "Number": [0,0,0,0,0,1,1,3,5,6,4,6,7,8,7,10,25,50,75,60,45,100],
})
# Result the asker expects:
# "Number is Corrected":[0,0,0,0,0,1,1,3,5,6,6,6,7,7,7,10,25,50,50,60,60,100]})


def correction():
    """Add a "Number is Corrected" column to the module-level ``df`` and print it.

    Every corrected value is the largest "Number" seen up to that row (a
    running maximum), so the column never decreases from one row to the next.
    This differs from the asker's expected list in a few places: for example
    the 8 stays 8 here, where the expected list has 7.
    """
    # cummax() is the vectorised form of "remember the biggest value so far and
    # overwrite anything smaller". It also avoids chained assignment
    # (df[col][i] = ...), which pandas may apply to a temporary copy instead of
    # the frame itself.
    df['Number is Corrected'] = df['Number'].cummax()
    print(df)


if __name__ == "__main__":
    correction()
But there is some inconsistency, as in your conversation with jezrael. You may need to update the logic of the code once it becomes clearer what output you want. Good luck.
In a dataframe, I have a column "UnixTime" and want to convert it to a new column containing the UTC time.
import pandas as pd
from datetime import datetime, timezone

df = pd.DataFrame([1565691196, 1565691297, 1565691398], columns = ["UnixTime"])
unix_list = df["UnixTime"].tolist()
# Format each epoch-seconds value as a "YYYY-MM-DD HH:MM:SS" string in UTC.
# datetime.utcfromtimestamp() is deprecated since Python 3.12; asking
# fromtimestamp() for an explicit UTC timezone yields the same clock time.
utc_list = [
    datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
    for ts in unix_list
]
df["UTC"] = utc_list
This works, but I guess there is a smarter approach?
Could you try this:
# Interpret the UnixTime integers as seconds (unit='s') and store the
# converted datetimes in a new "UTC" column.
df["UTC"] = pd.to_datetime(df['UnixTime'], unit='s')
If by a smarter approach you mean the pandas way with less code, then this is your answer:
# Convert the whole column in one call: unit = "s" tells pandas that the
# integers are seconds, and the result is assigned to the new "UTC" column.
df["UTC"] = pd.to_datetime(df["UnixTime"], unit = "s")
Hope this helps.