Filter Pandas dataframe with user input - python

I'm trying to develop this code where I would have certain inputs for different variables, these would make the filter happen and return the filtered dataframe, this input will always only receive a single value that the user will choose amoung fewer options and if the input is empty, that filter must bring all the data.
I didn't put the user input because I was testing the function first, however, the function always returns an empty dataframe and I can't find out why. Here is the code I was developing:
I didn't put the dataframe because it comes from an excel, but if necessary I'll put together a sample that fits
df = pd.DataFrame({"FarolAging":["Vermelho","Verde","Amarelo"],"Dias Pendentes":["20 dias","40 dias","60 dias"],"Produto":["Prod1","Prod1","Prod2"],
"Officer":["Alexandre Denardi","Alexandre Denardi","Lucas Fernandes"],"Analista":["Guilherme De Oliveira Moura","Leonardo Silva","Julio Cesar"],
"Coord":["Anna Claudia","Bruno","Bruno"]})
FarolAging1 = ['Vermelho']
DiasPendentes = []
Produto = []
Officer = []
def func(FarolAging1,DiasPendentes,Produto,Officer):
if len(Officer) <1:
Officer = df['Officer'].unique()
if len(FarolAging1) <1:
FarolAging1 = df['FarolAging'].unique()
if len(DiasPendentes) <1:
DiasPendentes = df['Dias Pendentes'].unique()
if len(Produto) <1:
Produto = df['Produto'].unique()
dados2 = df.loc[df['FarolAging'].isin([FarolAging1]) & (df['Dias Pendentes'].isin([DiasPendentes])) & (df['Produto'].isin([Produto])) & (df['Officer'].isin([Officer]))]
print(dados2)
func(FarolAging1, DiasPendentes, Produto, Officer) ```

You have to remove the square brackets in isin because you already have lists:
def func(FarolAging1,DiasPendentes,Produto,Officer):
if len(Officer) <1:
Officer = df['Officer'].unique()
if len(FarolAging1) <1:
FarolAging1 = df['FarolAging'].unique()
if len(DiasPendentes) <1:
DiasPendentes = df['Dias Pendentes'].unique()
if len(Produto) <1:
Produto = df['Produto'].unique()
# Transform .isin([...]) into .isin(...)
dados2 = (df.loc[df['FarolAging'].isin(FarolAging1)
& (df['Dias Pendentes'].isin(DiasPendentes))
& (df['Produto'].isin(Produto))
& (df['Officer'].isin(Officer))])
print(dados2)
return dados2 # don't forget to return something
Output:
>>> func(FarolAging1, DiasPendentes, Produto, Officer)
FarolAging Dias Pendentes Produto Officer Analista Coord
0 Vermelho 20 dias Prod1 Alexandre Denardi Guilherme De Oliveira Moura Anna Claudia

Related

Python Index out of range Error in lib loop issue

everything's fine? I hope so.
I'm dealing with this issue: List index out of range. -
Error message:
c:\Users.....\Documents\t.py:41: FutureWarning: As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead. You can set the option io.excel.xls.writer to 'xlwt' to silence this warning. While this option is deprecated and will also raise a warning, it can be globally set and the warning suppressed.
read_file.to_excel(planilhaxls, index = None, header=True)
The goal: I need to create a loop that store a specific line of a worksheet such as sheet_1.csv, this correspondent line in sheet_2.csv and a third sheet also, stored in 3 columns in a sheet_output.csv
Issue: It's getting an index error out of range that I don't know what to do
Doubt: There is any other way that I can do it?
The code is below:
(Please, ignore portuguese comments)
import xlrd as ex
import pyautogui as pag
import os
import pyperclip as pc
import pandas as pd
import pygetwindow as pgw
import openpyxl
#Inputs
numerolam = int(input('Escolha o número da lamina: '))
amostra = input('Escoha a amostra: (X, Y, W ou Z): ')
milimetro_inicial = int(input("Escolha o milimetro inicial: "))
milimetro_final = int(input("Escolha o milimetro final: "))
tipo = input("Escolha o tipo - B para Branco & E para Espelho: ")
linha = int(input("Escolha a linha da planilha: "))
# Conversão de código
if tipo == 'B':
tipo2 = 'BRA'
else:
tipo2 = 'ESP'
#Arquivo xlsx
#planilhaxlsx = f'A{numerolam}{amostra}{milimetro_inicial}{tipo2}.xlsx'
#planilhaxls = f'A{numerolam}{amostra}{milimetro_inicial}{tipo2}.xls'
#planilhacsv = f'A{numerolam}{amostra}{milimetro_inicial}{tipo2}.csv'
#planilhacsv_ = f'A{numerolam}{amostra}{milimetro_final}{tipo2}.csv'
#arquivoorigin = f'A{numerolam}{amostra}{milimetro_inicial}{tipo2}.opj'
#Pasta
pasta = f'L{numerolam}{amostra}'
while milimetro_inicial < milimetro_final:
planilhaxlsx = f'A{numerolam}{amostra}{milimetro_inicial}{tipo2}.xlsx'
planilhaxls = f'A{numerolam}{amostra}{milimetro_inicial}{tipo2}.xls'
planilhacsv = f'A{numerolam}{amostra}{milimetro_inicial}{tipo2}.csv'
planilhacsv_ = f'A{numerolam}{amostra}{milimetro_final}{tipo2}.csv'
arquivoorigin = f'A{numerolam}{amostra}{milimetro_inicial}{tipo2}.opj'
# Converte o arquivo .csv para .xls e .xlsx
read_file = pd.read_csv(planilhacsv)
read_file.to_excel(planilhaxls, index = None, header=True)
#read_file.to_excel(planilhaxlsx, index = None, header=True)
# Abre o arquivo .xls com o xlrd - arquivo excel.
book = ex.open_workbook(planilhaxls)
sh = book.sheet_by_index(0)
# Declaração de variáveis.
coluna_inicial = 16 # Q - inicia em 0
valor = []
index = 0
# Loop que armazena o valor da linha pela coluna Q-Z na variável valor 0-(len-1)
while coluna_inicial < 25:
**#ERRO NA LINHA ABAIXO**
**temp = sh.cell_value(linha, coluna_inicial)**
valor.append(temp) # Adiciona o valor
print(index)
print(valor[index])
index +=1
coluna_inicial += 1
# Abre planilha de saída
wb = openpyxl.Workbook()
ws = wb.active
#Inicia loop de escrita
colunas = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
idx_colunas = 0
contador_loop = colunas[idx_colunas]
linha_loop = 1
index_out = 0
s = f'{contador_loop}{linha_loop}'
print(s)
while linha_loop < len(valor):
valor[index_out] = "{}".format(valor[index_out])
ws[s].value = valor[index_out]
print(valor[index_out] + ' feito')
linha_loop += 1
idx_colunas += 1
index_out += 1
# Salva planilha de saída
wb.save("teste.xlsx")
milimetro_inicial += 1
Your problem is on this line
temp = sh.cell_value(linha, coluna_inicial)
There are two index params used linha and coluna_inicial, 'linha' appears to be a static value so the problem would seem to be with 'coluna_inicial' which gets increased by 1 each iteration
coluna_inicial += 1
The loop continues while 'coluna_inicial' value is less than 25. I suggest you check number of columns in the sheet 'sh' using
sh.ncols
either for debugging or as the preferred upper value of your loop. If this is less than 25 you will get the index error once 'coluna_inicial' value exceeds the 'sh.ncols' value.
<---------------Additional Information---------------->
Since this is an xls file there would be no need for delimiter settings, your code as is should open it correctly. However since the xls workbook to be opened is determined by params entered by the user at the start presumably meaning there are a number in the directory to choose from, are you sure you are checking the xls file your code run is opening? Also if there is more than one sheet in the workbook(s) are you opening the correct sheet?
You can print the workbook name to be sure which one is being opened. Also by adding verbosity to the open_workbook command (level 2 should be high enough), it will upon opening the book, print in console details of the sheets available including number of rows and columns in each.
print(planilhaxls)
book = ex.open_workbook(planilhaxls, verbosity=2)
sh = book.sheet_by_index(0)
print(sh.name)
E.g.
BOF: op=0x0809 vers=0x0600 stream=0x0010 buildid=14420 buildyr=1997 -> BIFF80
sheet 0('Sheet1') DIMENSIONS: ncols=21 nrows=21614
BOF: op=0x0809 vers=0x0600 stream=0x0010 buildid=14420 buildyr=1997 ->
BIFF80
sheet 1('Sheet2') DIMENSIONS: ncols=13 nrows=13
the print(sh.name) as shown checks the name of the sheet that 'sh' is assigned to.

Update dictionary within dictionary dynamically return same character count for different parameters

I'm trying to retrieve wikipedia pages' characters count, for articles in different languages. I'm using a dictionary with as key the name of the page and as value a dictionary with the language as key and the count as value.
The code is:
pages = ["L'arte della gioia", "Il nome della rosa"]
langs = ["it", "en"]
dicty = {}
dicto ={}
numz = 0
for x in langs:
wikipedia.set_lang(x)
for y in pages:
pagelang = wikipedia.page(y)
splittedpage = pagelang.content
dicto[y] = dicty
for char in splittedpage:
numz +=1
dicty[x] = numz
If I print dicto, I get
{"L'arte della gioia": {'it': 72226, 'en': 111647}, 'Il nome della rosa': {'it': 72226, 'en': 111647}}
The count should be different for the two pages.
Please try this code. I didn't run it because I don't have the wikipedia module.
Notes:
Since your expected result is dict[page,dict[lan,cnt]], I think first iterate pages is more natural, then iterate languages. Maybe for performance reason you want first iterate languages, please comment.
Characters count of text can simply be len(text), why iterate and sum again?
Variable names. You will soon be lost in x y like variables.
pages = ["L'arte della gioia", "Il nome della rosa"]
langs = ["it", "en"]
dicto = {}
for page in pages:
lang_cnt_dict = {}
for lang in langs:
wikipedia.set_lang(lang)
page_lang = wikipedia.page(page)
chars_cnt = len(pagelang.content)
lang_cnt_dict[lan] = chars_cnt
dicto[page] = lang_cnt_dict
print(dicto)
update
If you want iterate langs first
pages = ["L'arte della gioia", "Il nome della rosa"]
langs = ["it", "en"]
dicto = {}
for lang in langs:
wikipedia.set_lang(lang)
for page in pages:
page_lang = wikipedia.page(page)
chars_cnt = len(pagelang.content)
if page in dicto:
dicto[page][lang] = chars_cnt
else:
dicto[page] = {lang: chars_cnt}
print(dicto)

How do I return or print an attribute that's a math function between two attributes? Python

I'm very new to Python and have checked the three other posts about this subject but haven't been able to implement them successfully.
Essentially, I'm trying to return the name of the county with the highest voter turnout and the percentage. I can't seem to figure out how to return or print the latter part, as I don't have an attribute for the math portion (voters / population).
I've played around with some things like:
def percentage(self, turnout):
self.turnout = voters / population
Sorry if this post does not format correctly — this is all very new! Thanks in advance.
class County:
def __init__(self, name, population, voters):
self.name = name
self.population = population
self.voters = voters
def highest_turnout(data):
highest_county = data[0]
highest_percentage = (data[0].voters / data[0].population)
for county in data:
if (county.voters / county.population) > highest_percentage:
highest_county = county
highest_percentage = (county.voters / county.population)
return highest_county.name
# implement the function here
# your program will be evaluated using these objects
# it is okay to change/remove these lines but your program
# will be evaluated using these as inputs
allegheny = County("allegheny", 1000490, 645469) # this is an object
philadelphia = County("philadelphia", 1134081, 539069)
montgomery = County("montgomery", 568952, 399591)
lancaster = County("lancaster", 345367, 230278)
delaware = County("delaware", 414031, 284538)
chester = County("chester", 319919, 230823)
bucks = County("bucks", 444149, 319816)
data = [allegheny, philadelphia, montgomery, lancaster, delaware, chester, bucks]
result = highest_turnout(data) # do not change this line!
print(result) # prints the output of the function
# do not remove this line!
You simply return multiple values:
return highest_county.name, highest_percentage
In your calling program:
best_county, best_pct = highest_turnout(data)
What's very cool in python is you can actually just write the following:
highest_turnout = max(data, key=lambda county: county.voters / county.population)
Here, highest_turnout is the county with the highest turnout. What we've done is told python to calculate the maximum of the dataset where the values being compared is voters/population ie: the percentage of voters that came. In other words, this does exactly what your highest_turnout function does in a single line. You might consider defining a method for your County class called get_turnout() which just returns the percentage of the population that voted.
Obviously with highest_turnout we can write
highest_turnout.name
and
highest_turnout.voters / highest_turnout.population
to have the values you seek.

How do I consolidate my code to have one variable that changes for each input?

I'm building a program that will take the skillsets of different candidates for a given job, and check to see if they have the required skills. I have figured out how to make this work, but I don't know how to do it without writing "candidate1", "candidate2" etc. Is there a more efficient way to do this?:
list_of_qualities = ['Experience in Cold Calling', 'Experience in Door to
Door Sales', 'Experience in Account Management','Experience in Warm Leads','Experience in Presenting', 'Experience in Negotiation',\'Experience in Leadership', 'Experience in Closing']
cold_calling = list_of_qualities[0]
door_to_door = list_of_qualities[1]
account_management = list_of_qualities[2]
warm_leads = list_of_qualities[3]
presenting = list_of_qualities[4]
negotiation = list_of_qualities[5]
leadership = list_of_qualities[6]
closing = list_of_qualities[7]
required_qualities = [cold_calling, presenting, account_management, leadership, closing]
candidate1 = [cold_calling, presenting, account_management, leadership, closing, door_to_door]
candidate2 = [cold_calling, warm_leads, account_management, leadership]
candidate3 = [cold_calling, account_management]
matched_qualities1 = []
matched_qualities2 = []
matched_qualities3 = []
lacking_qualities1 = []
lacking_qualities2 = []
lacking_qualities3 = []
print("To view and apply for your job, candidates must have the following skillset:")
print(required_qualities)
print(" ")
print("The candidates have the following matching skills:")
Candidate 1
for i in candidate1:
if i in required_qualities:
matched_qualities1.append(i)
print("Candidate 1:", matched_qualities1)
for i in required_qualities:
if i not in candidate1:
lacking_qualities1.append(i)
Check if candidate 1 has all skills or not
if len(lacking_qualities1) == 0:
print(" This candidate has all of the required skills")
else:
print(" lacking:", lacking_qualities1)
There are several options. You can have a list of lists, or a dataframe. An array is also an option, but not really appropriate. Some other things to simplify your code:
You can do cold_calling,door_to_door,account_management,warm_leads,presenting,negotiation,
leadership,closing = list_of_qualities
Also, you can replace your for-loop with lacking_qualities1=[quality in required_qualities if not quality in candidate1]
If you create a list of candidates, you can do
lacking_qualities_list_of_list=[
[quality in required_qualities if not quality in candidate]
for candidate in list_of_candidates]

how do i divide list into smallers list

My list is formatted like:
gymnastics_school,participant_name,all-around_points_earned
I need to divide it up by schools but keep the scores.
import collections
def main():
names = ["gymnastics_school", "participant_name", "all_around_points_earned"]
Data = collections.namedtuple("Data", names)
data = []
with open('state_meet.txt','r') as f:
for line in f:
line = line.strip()
items = line.split(',')
items[2] = float(items[2])
data.append(Data(*items))
These are examples of how they're set up:
Lanier City Gymnastics,Ben W.,55.301
Lanier City Gymnastics,Alex W.,54.801
Lanier City Gymnastics,Sky T.,51.2
Lanier City Gymnastics,William G.,47.3
Carrollton Boys,Cameron M.,61.6
Carrollton Boys,Zachary W.,58.7
Carrollton Boys,Samuel B.,58.6
La Fayette Boys,Nate S.,63
La Fayette Boys,Kaden C.,62
La Fayette Boys,Cohan S.,59.1
La Fayette Boys,Cooper J.,56.101
La Fayette Boys,Avi F.,53.401
La Fayette Boys,Frederic T.,53.201
Columbus,Noah B.,50.3
Savannah Metro,Levi B.,52.801
Savannah Metro,Taylan T.,52
Savannah Metro,Jacob S.,51.5
SAAB Gymnastics,Dawson B.,58.1
SAAB Gymnastics,Dean S.,57.901
SAAB Gymnastics,William L.,57.101
SAAB Gymnastics,Lex L.,52.501
Suwanee Gymnastics,Colin K.,57.3
Suwanee Gymnastics,Matthew B.,53.201
After processing it should look like:
Lanier City Gymnastics:participants(4)
as it own list
Carrollton Boys(3)
as it own list
La Fayette Boys(6)
etc.
I would recommend putting them in dictionaries:
data = {}
with open('state_meet.txt','r') as f:
for line in f:
line = line.strip()
items = line.split(',')
items[2] = float(items[2])
if items[0] in data:
data[items[0]].append(items[1:])
else:
data[items[0]] = [items[1:]]
Then access schools could be done in the following way:
>>> data['Lanier City Gymnastics']
[['Ben W.',55.301],['Alex W.',54.801],['Sky T'.,51.2],['William G.',47.3]
EDIT:
Assuming you need the whole dataset as a list first, then you want to divide it into smaller lists you can generate the dictionary from the list:
data = []
with open('state_meet.txt','r') as f:
for line in f:
line = line.strip()
items = line.split(',')
items[2] = float(items[2])
data.append(items)
#perform median or other operation on your data
nested_data = {}
for items in data:
if items[0] in data:
data[items[0]].append(items[1:])
else:
data[items[0]] = [items[1:]]
nested_data[item[0]]
When you need to get a subset of a list you can use slicing:
mylist[start:stop:step]
where start, stop and step are optional (see link for more comprehensive introduction)

Categories

Resources