I'm looking to categorize some sentences. To do this, I've created a couple of dictionary categories for "Price" and "Product Quality". So far, my code loops through the words within each category and displays the word it found.
I'd also like to add the actual category name like "Price" or "Product Quality" depending on the values within those keys.
Is there a way to display only the key of the category that matched? Currently it's just displaying both "Price" and "Product Quality" for everything.
Here is the code:
# Sample sentences to categorize.
data = ["Great price on the dewalt saw", "cool deal and quality", "love it! and the price percent off", "definitely going to buy"]
# Category name -> substrings whose presence puts a sentence in that category.
words = {'price': ['price', 'compare', '$', 'percent', 'money', '% off'],
         'product_quality': ['quality', 'condition', 'aspect']}
for d in data:
    # Iterating .values() yields only the keyword lists; the category name
    # that owns each list is not available inside this loop.
    for word in words.values():
        for s in word:
            # Plain substring test against the whole sentence.
            if s in d:
                # words.keys() is the view of ALL keys, so every match prints
                # both category names instead of the one that matched.
                print(id(d), ", ", d, ", ", s, ", ", words.keys())
Here is the output as well:
4398300496 , Great price on the dewalt saw , price , dict_keys(['price', 'product_quality'])
4399544552 , cool deal and quality , quality , dict_keys(['price', 'product_quality'])
4398556680 , love it! and the price percent off , price , dict_keys(['price', 'product_quality'])
4398556680 , love it! and the price percent off , percent , dict_keys(['price', 'product_quality'])
You can use items(), which unpacks into (key, value):
# Sentences to scan for category keywords.
data = [
    "Great price on the dewalt saw",
    "cool deal and quality",
    "love it! and the price percent off",
    "definitely going to buy",
]

# Maps each category name to the substrings that signal it.
words = {
    'price': ['price', 'compare', '$', 'percent', 'money', '% off'],
    'product_quality': ['quality', 'condition', 'aspect'],
}

for sentence in data:
    # items() yields (category, keywords) pairs, so the name of the matching
    # category is at hand whenever one of its keywords is found.
    for category, keywords in words.items():
        for keyword in keywords:
            if keyword not in sentence:
                continue
            print(id(sentence), ", ", sentence, ", ", keyword, ", ", category)
Out:
(4338487344, ', ', 'Great price on the dewalt saw', ', ', 'price', ', ', 'price')
(4338299376, ', ', 'cool deal and quality', ', ', 'quality', ', ', 'product_quality')
(4338487416, ', ', 'love it! and the price percent off', ', ', 'price', ', ', 'price')
(4338487416, ', ', 'love it! and the price percent off', ', ', 'percent', ', ', 'price')
Related
I am trying to scrape PFF.com for football grades with selenium, I am trying to get a specific grade for all Quarterbacks. Problem is, it doesn't seem like it's capturing the text as .text isn't working but I am not getting any NoSuchElementException.
Here's my code:
# Start Chrome through a local chromedriver and open the 2022 regular-season
# QB passing page.
service = Service(executable_path="C:\\chromedriver.exe")
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=op)
driver.get("https://premium.pff.com/nfl/positions/2022/REG/passing?position=QB")
sleep(2)  # fixed pause, presumably to let the page render

# Open the sign-in form from the page header.
sign_in = driver.find_element(By.XPATH, '/html/body/div/div/header/div[3]/button')
sign_in.click()
sleep(2)

# Fill in the credentials (my_email / my_password are defined elsewhere).
email = driver.find_element(By.XPATH, '/html/body/div/div/div/div/div/div/form/div[1]/input')
email.send_keys(my_email)
password = driver.find_element(
    By.XPATH, '/html/body/div/div/div/div/div/div/form/div[2]/input')
password.send_keys(my_password)
sleep(2)

# Submit the form.
sign_in_2 = driver.find_element(
    By.XPATH, '/html/body/div/div/div/div/div/div/form/button')
sign_in_2.click()
sleep(2)

# The selector has to be ONE string literal: a quoted string cannot continue
# onto the next source line, so the two halves are joined with the space
# (descendant combinator) that separated them.
all_off_grades = driver.find_elements(
    By.CSS_SELECTOR, '.kyber-table .kyber-grade-badge__info-text div')
all_qb_names = driver.find_elements(By.CSS_SELECTOR, '.kyber-table .p-1 a')

# Extract the visible text of every matched element.
qb_grades = [grade.text for grade in all_off_grades]
qb_names = [qb_name.text for qb_name in all_qb_names]
print(qb_grades)
print(qb_names)
The lists keep showing as empty.
Here are the elements I am trying to pull for every QB. I have already confirmed that the other QBs have the same class names for their grade and name.
<div class="kyber-grade-badge__info-text">91.5</div>
need to pull the 91.5
<a class="p-1" href="/nfl/players/2022/REG/josh-allen/46601/passing">Josh Allen</a>
need to pull Josh Allen
@Jbuck3 I tried modifying the locators and it works for me. I am also including the output I am getting. Let me know if that is what you were expecting.
# Grade badges located under the table body's scrolling-rows container.
all_off_grades = driver.find_elements(By.CSS_SELECTOR, '.kyber-table-body__scrolling-rows-container .kyber-grade-badge__info-text')
# Player links, matched by their data-gtm-id attribute rather than by class.
all_qb_names = driver.find_elements(By.CSS_SELECTOR, "a[data-gtm-id = 'player_name']")
And the output I got is:
['91.5', '90.3', '74.6', '-', '-', '60.0', '84.3', '78.3', '78.1', '-', '-', '60.0', '82.8', '83.4', '-', '-', '-', '60.0']
['Josh Allen ', 'Geno Smith ', 'Kirk Cousins ', 'Marcus Mariota ', 'Jameis Winston ', 'Trey Lance ', 'Derek Carr ', 'Justin Fields ', 'Trevor Lawrence ', 'Russell Wilson ', 'Ryan Tannehill ', 'Tom Brady ', 'Tua Tagovailoa ', 'Mac Jones ', 'Davis Mills ', 'Matthew Stafford ', 'Baker Mayfield ', 'Lamar Jackson ', 'Joe Flacco ', 'Matt Ryan ', 'Jalen Hurts ', 'Daniel Jones ', 'Kyler Murray ', 'Justin Herbert ', 'Joe Burrow ', 'Aaron Rodgers ', 'Patrick Mahomes ', 'Mitchell Trubisky ', 'Dak Prescott ', 'Jacoby Brissett ', 'Carson Wentz ', 'Jared Goff ']
I made a dictionary using .groupdict() function, however, I am having a problem regarding elimination of certain output dictionaries.
For example, my code looks like this (tweet is a string that contains 5 elements separated by ||):
def somefuntion(pattern, tweet):
    """Print the username/botprob fields parsed from each record in *tweet*.

    Args:
        pattern: Ignored. It is overwritten by the built-in pattern below,
            exactly as in the original code.
        tweet: Iterable of record strings; each record is expected to hold
            fields separated by ``||`` (assumed from the regex - confirm
            against the caller).

    Returns:
        None. One dict per regex match is printed, with the keys
        ``username`` (first field) and ``botprob`` (fourth field), both still
        as unstripped text.
    """
    # Raw string: the regex backslashes stay literal and Python does not warn
    # about the invalid "\|" escape sequence. The pattern value is unchanged.
    pattern = r"^(?P<username>.*?)(?:\|{2}[^|]+){2}\|{2}(?P<botprob>.*?)(?:\|{2}|$)"
    for paper in tweet:
        for item in re.finditer(pattern, paper):
            # The groupdict() result was previously discarded, so nothing
            # was ever shown; print it to produce the described output.
            print(item.groupdict())
This produces an output in the form:
{'username': 'yashrgupta ', 'botprob': ' 0.30794588629999997 '}
{'username': 'sterector ', 'botprob': ' 0.39391528649999996 '}
{'username': 'MalcolmXon ', 'botprob': ' 0.05630123819 '}
{'username': 'ryechuuuuu ', 'botprob': ' 0.08492567222000001 '}
{'username': 'dpsisi ', 'botprob': ' 0.8300337045 '}
But I would like it to only return dictionaries whose botprob is above 0.7. How do I do this?
Specifically, as @WiktorStribizew notes, just skip the iterations you don't want:
# Raw string: keeps the regex backslashes literal without invalid-escape
# warnings. The pattern value itself is unchanged.
pattern = r"^(?P<username>.*?)(?:\|{2}[^|]+){2}\|{2}(?P<botprob>.*?)(?:\|{2}|$)"
for paper in tweet:
    for item in re.finditer(pattern, paper):
        item = item.groupdict()
        # botprob is captured as text, and comparing str with float raises
        # TypeError on Python 3, so convert first. float() tolerates the
        # surrounding spaces. Records at or below 0.7 are skipped so that only
        # those "above 0.7" are printed, matching the other variants.
        if float(item["botprob"]) <= 0.7:
            continue
        print(item)
This could be wrapped in a generator expression to save the explicit continue, but there's enough going on as it is without making it harder to read (in this case).
UPDATE since you are apparently in a function:
def somefuntion(pattern, tweet):
    """Return the parsed records whose bot probability is above 0.7.

    Args:
        pattern: Ignored. It is overwritten by the built-in pattern below,
            as in the question's function.
        tweet: Iterable of record strings with ``||``-separated fields
            (assumed from the regex - confirm against the caller).

    Returns:
        A list of dicts with the keys ``username`` and ``botprob`` (both
        unstripped text), one per regex match whose ``botprob`` converts to a
        float strictly greater than 0.7.

    Raises:
        ValueError: If a captured ``botprob`` is not a valid float literal.
    """
    # Raw string avoids the invalid "\|" escape warning; same pattern value.
    pattern = r"^(?P<username>.*?)(?:\|{2}[^|]+){2}\|{2}(?P<botprob>.*?)(?:\|{2}|$)"
    items = []
    for paper in tweet:
        for match in re.finditer(pattern, paper):
            fields = match.groupdict()
            # float() ignores the padding spaces around the captured number.
            if float(fields["botprob"]) > 0.7:
                items.append(fields)
    return items
Or using comprehensions:
# Function-body fragment: `pattern` and `tweet` are expected to be bound by
# the enclosing function.
# Lazily produce the named-group dict of every regex match across all records.
groupdicts = (item.groupdict() for paper in tweet for item in re.finditer(pattern, paper))
# Keep only the records whose captured botprob, read as a float, exceeds 0.7.
return [item for item in groupdicts if float(item["botprob"]) > 0.7]
I would like it to only return dictionaries whose botprob is above 0.7.
# Parsed records; both fields are still text padded with spaces.
entries = [
    {'username': 'yashrgupta ', 'botprob': ' 0.30794588629999997 '},
    {'username': 'sterector ', 'botprob': ' 0.39391528649999996 '},
    {'username': 'MalcolmXon ', 'botprob': ' 0.05630123819 '},
    {'username': 'ryechuuuuu ', 'botprob': ' 0.08492567222000001 '},
    {'username': 'dpsisi ', 'botprob': ' 0.8300337045 '},
]

# Collect the records whose numeric botprob is strictly above 0.7.
filtered_entries = []
for entry in entries:
    probability = float(entry['botprob'].strip())
    if probability > 0.7:
        filtered_entries.append(entry)

print(filtered_entries)
output
[{'username': 'dpsisi ', 'botprob': ' 0.8300337045 '}]
I'm trying to write a nested dictionary to a CSV file and running into issues; either the file doesn't write anything, or it errors out.
The dictionary looks something like this:
# Survey results: question text -> list of single-entry {user_id: answer}
# dicts. The enclosing braces were missing, which made the literal invalid;
# the answer strings that pretty-printing had split across lines are merged
# back into single literals with identical values.
finalDict = {
    'How would you rate the quality of the product?': [
        {'10942625544': 'High quality'},
        {'10942625600': 'Neither high nor low quality'},
        {'10942625675': 'Neither high nor low quality'},
        {'10942625736': 'Very high quality'},
        {'10942625788': 'Neither high nor low quality'},
        {'10942625827': 'Neither high nor low quality'},
        {'10942625878': 'Neither high nor low quality'},
        {'10942625932': 'High quality'},
        {'10942625977': 'High quality'},
        {'10942626027': 'Neither high nor low quality'},
        {'10942626071': 'High quality'},
        {'10942626128': 'High quality'},
        {'10942626180': 'Very high quality'},
        {'10942626227': 'Very high quality'},
        {'10942626278': 'High quality'},
        {'10942626332': 'Low quality'},
        {'10942626375': 'Very high quality'},
        {'10942626430': 'Low quality'},
        {'10942626492': 'Low quality'},
    ],
    'How would you rate the value for money of the product?': [
        {'10942625544': 'Above average'},
        {'10942625600': 'Below average'},
        {'10942625675': 'Average'},
        {'10942625736': 'Excellent'},
        {'10942625788': 'Above average'},
        {'10942625827': 'Below average'},
        {'10942625878': 'Average'},
        {'10942625932': 'Average'},
        {'10942625977': 'Above average'},
        {'10942626027': 'Above average'},
        {'10942626071': 'Above average'},
        {'10942626128': 'Average'},
        {'10942626180': 'Excellent'},
        {'10942626227': 'Average'},
        {'10942626278': 'Average'},
        {'10942626332': 'Below average'},
        {'10942626375': 'Excellent'},
        {'10942626430': 'Poor'},
        {'10942626492': 'Below average'},
    ],
}
I've tried working off of Write Nested Dictionary to CSV but am struggling to adapt it to my specific case.
My code currently looks like:
def writeToCsv(finalDict):
    # Column names used for the CSV header.
    csv_columns = ['Question', 'UserID', 'Answer']
    filename = "output.csv"
    # NOTE: `as filename` rebinds the path variable to the open file object.
    with open(filename, "w") as filename:
        w = csv.DictWriter(filename, fieldnames=csv_columns)
        w.writeheader()
        # Iterating a dict directly yields only its keys (the question texts).
        for data in finalDict: #where I'm stuck
Any recommendations would be appreciated!
This is an option:
def writeToCsv(finalDict):
    """Flatten a nested survey dict into ``output.csv``, one row per answer.

    Args:
        finalDict: Mapping of question text to a list of
            ``{user_id: answer}`` dicts.

    Returns:
        None. ``output.csv`` in the current working directory is
        (over)written with a ``Question,UserID,Answer`` header followed by
        one row for every (question, user, answer) triple, each line ending
        in ``\\n``.
    """
    csv_columns = ['Question', 'UserID', 'Answer']
    filename = "output.csv"
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; otherwise the platform may translate the
    # terminator a second time.
    with open(filename, "w", newline='') as fl:
        w = csv.DictWriter(fl, fieldnames=csv_columns, lineterminator='\n')
        w.writeheader()
        for question, data in finalDict.items():
            for item in data:
                # A response dict may hold several user/answer pairs; each
                # one becomes its own row.
                for user, answer in item.items():
                    w.writerow({'Question': question,
                                'UserID': user,
                                'Answer': answer})
# Alternative row loop; `w` (the csv.DictWriter) and `finalDict` come from the
# surrounding function.
for question, responses in finalDict.items():
    for response in responses:
        # Only the first user/answer pair of each response dict is written.
        user_id, answer = list(response.items())[0]
        w.writerow({'Question': question, 'UserID': user_id, 'Answer': answer})
def exportOrders(self):
    """Write the buy and sell orders in self.tradeHistory to orders.txt."""
    file = open("orders.txt", 'w')
    # Header row: each column title is wrapped in double quotes.
    file.write("\"Date\" \"Pair\" \"Amount bought/sold\" \"Pair Price\" \"Profit/Loss\" \"Order Type\"" + '\n')
    for x in self.tradeHistory:
        date = x['date']
        pair = self.currentPair
        # The traded amount is stored under the pair's own name.
        amount = x[self.currentPair]
        price = x['price']
        order = x['Order Type']
        if order == "buy":
            spent = x['spent']
            # NOTE(review): the sample records hold numbers for date, amount,
            # price and spent; `+` between a number and a str raises
            # TypeError, so this write fails on the first "buy" record.
            file.write(date + ' ' + pair + ' ' + amount + ' '
                       + price + ' ' + float(-spent) + ' ' + order + ' \n')
        if order == "sell":
            obtained = x['obtained']
            # NOTE(review): same number/str concatenation as the "buy" branch.
            file.write(date + ' ' + pair + ' ' + amount + ' '
                       + price + ' ' + obtained + ' ' + order + ' \n')
    # Only reached when the loop finishes without raising.
    file.close()
self.tradeHistory is a list of dictionaries that store a date, a pair, the amount bought, the price of the pair, the money spent or obtained, and the order type.
My problem is that when the program runs for the first time into:
if order == "buy":
spent = x['spent']
file.write(date + ' ' + pair + ' ' + amount + ' '
+ price + ' ' + str(float(-spent)) + ' ' + order + ' \n')
The for loop breaks out and the orders.txt only shows the first line which is:
file.write("\"Date\" \"Pair\" \"Amount bought/sold\" \"Pair Price\" \"Profit/Loss\" \"Order Type\"" + '\n')
Thank you in advance!
edit:
Basically, my self.tradeHistory has the following content
{'date': 1505161800, 'BTC_ETH': 0.7091196761422075, 'price': 0.07050996, 'spent': 0.05, 'Order Type': 'buy'}
{'date': 1505167200, 'BTC_ETH': 0.7091196761422075, 'price': 0.07079909, 'obtained': 0.050205027771963, 'Order Type': 'sell'}
{'date': 1505236500, 'BTC_ETH': 0.7032346826344071, 'price': 0.07110002, 'spent': 0.05, 'Order Type': 'buy'}
{'date': 1505251800, 'BTC_ETH': 0.7032346826344071, 'price': 0.0707705, 'obtained': 0.04976827010737831, 'Order Type': 'sell'}
{'date': 1505680200, 'BTC_ETH': 0.715374411944349, 'price': 0.06989347, 'spent': 0.05, 'Order Type': 'buy'}
{'date': 1505699100, 'BTC_ETH': 0.715374411944349, 'price': 0.071989, 'obtained': 0.05149908854146174, 'Order Type': 'sell'}
{'date': 1505733300, 'BTC_ETH': 0.6879187705515734, 'price': 0.072683, 'spent': 0.05, 'Order Type': 'buy'}
{'date': 1505745000, 'BTC_ETH': 0.6889021311187427, 'price': 0.07257925, 'spent': 0.05, 'Order Type': 'buy'}
{'date': 1505756700, 'BTC_ETH': 1.3768209016703161, 'price': 0.0732, 'obtained': 0.10078329000226714, 'Order Type': 'sell'}
...
There are 63 items inside the list of dictionaries. My aim is to create a .txt file that looks like
"Date" "Pair" "Amount bought/sold" "Pair Price" "Profit/Loss" "Order Type"
1505161800 BTC_ETH 0.7091196761422075 0.07050996 0.05 buy
1505167200 BTC_ETH 0.7091196761422075 0.07079909 0.05 sell
...
You cannot concatenate numbers with strings in Python: an expression such as date + ' ' raises a TypeError when date is a number. Use str.format instead:
# str.format converts every argument to text itself, so the numeric fields
# need no manual str() calls and can no longer trigger a str + number error.
file.write(
    '{} {} {} {} {} {}\n'
    .format(date, pair, amount, price, float(-spent), order)
)
You can also use csv module for a better implementation.
import csv
def exportOrders(self):
    """Write ``self.tradeHistory`` to ``orders.txt`` as space-delimited rows.

    Reads:
        self.tradeHistory: Iterable of order dicts with the keys ``date``,
            ``price``, ``Order Type``, the current pair's name (the traded
            amount), and ``spent`` (buy orders) or ``obtained`` (sell orders).
        self.currentPair: Name of the traded pair, e.g. ``'BTC_ETH'``.

    Returns:
        None. The file gets one header row and one row per buy/sell order.
        The Profit/Loss column holds ``-spent`` for buys and ``obtained`` for
        sells. Orders of any other type produce no row.

    Raises:
        KeyError: If an order dict lacks one of the keys listed above.
    """
    # newline='' lets the csv module write its own line terminator; without
    # it, platforms that translate "\n" would emit doubled carriage returns.
    with open("orders.txt", 'w', newline='') as file:
        writer = csv.writer(file, delimiter=' ', quotechar='"')
        writer.writerow([
            'Date', 'Pair', 'Amount bought/sold', 'Pair Price',
            'Profit/Loss', 'Order Type'])
        pair = self.currentPair
        for x in self.tradeHistory:
            date = x['date']
            amount = x[pair]
            price = x['price']
            order = x['Order Type']
            if order == "buy":
                # Money spent is recorded as a negative profit.
                profit_loss = float(-x['spent'])
            elif order == "sell":
                profit_loss = x['obtained']
            else:
                # Unknown order types are skipped, as before.
                continue
            writer.writerow([date, pair, amount, price, profit_loss, order])
I am using TfidfVectorizer from scikit-learn to extract features,
And the settings are:
def tokenize(text):
    """Tokenize *text* with NLTK, keep only ASCII letters, and stem each token."""
    tokens = nltk.word_tokenize(text)
    stems = []
    for token in tokens:
        # Remove every character that is not an ASCII letter; a token made
        # only of such characters becomes the empty string and is still kept.
        token = re.sub("[^a-zA-Z]","", token)
        # A new EnglishStemmer is constructed for every token.
        stems.append(EnglishStemmer().stem(token))
    return stems
# Plug the custom tokenizer into the TF-IDF feature extractor.
vectorizer = TfidfVectorizer(tokenizer=tokenize, lowercase=True, stop_words='english')
After feeding the training set to the vectorizer, I call
vectorizer.get_feature_names()
the output contains some duplicate words with space: e.g.
u'', u' ', u' low', u' lower', u'lower', u'lower ', u'lower high', u'lower low'
And the acceptable output should be:
u'low', u'lower', u'lower high', u'lower low'
How can I solve that? Thank you.
You could do like the below,
features = ['lower low', 'lower high', 'lower ', ' lower', u'lower', ' ', '', 'low']
# Strip surrounding whitespace, drop every entry that is empty once stripped
# (this also catches strings of several spaces), de-duplicate with a set, and
# sort so the result order is reproducible instead of arbitrary.
cleaned = sorted({name.strip() for name in features if name.strip()})
print(cleaned)
# ['low', 'lower', 'lower high', 'lower low']