How do I join text if key value are the same - python

my code
import requests
# Question snippet: download the menu JSON and print each product next to the
# container(s) listed in its tags.
request = requests.get("https://itspaudal-git.github.io/jsonapi/roku.json")
package_json = request.json()
menu = package_json['Chicago']['Menu']['Strawberry Pie']
for i in menu:
product = i['item']
# NOTE(review): weight and uom are read but not used anywhere below.
weight = i['weight']
uom = i['uom']
# NOTE(review): indentation was lost in this paste; judging by the ungrouped
# output shown after the snippet, this dict is presumably re-created for every
# menu entry, so it never holds more than the current product.
container2products = {}
for j in i['tags']:
container = j['container']
container2products.setdefault(container,[])
container2products[container].append(product)
for container, products_list in container2products.items():
# Products are joined with a bare '&' (no surrounding spaces).
products_str = '&'.join(products_list)
print(products_str, container)
I was wondering if someone could point me in the right direction on how to concatenate the products when their key values (the containers) are the same. My current output is
Whipping Cream 1 oz cup
Water tray 1
Cornstarch tray 1
Sugar 1 oz cup
fresh strawberries 20 oz cup
and I want it to be
Whipping Cream & Sugar 1 oz cup
Water & Cornstarch tray 1
fresh strawberries 20 oz cup

I have made a few changes to your code and also added some comments for understanding.
We are using a single dictionary, created once before the loop, to store the products of each container instead of simple lists.
Hope it helps.
import requests
request = requests.get("https://itspaudal-git.github.io/jsonapi/roku.json")
package_json = request.json()
menu = package_json['Chicago']['Menu']['Strawberry Pie']

# Maps each container to the list of products tagged with it. The dict is
# created once, before the loop, so products from different menu entries that
# share a container end up in the same list.
items = {}
for entry in menu:
    product = entry['item']
    for tag in entry['tags']:
        # setdefault() creates the list the first time a container is seen and
        # returns the existing list afterwards, so we can append in place.
        items.setdefault(tag['container'], []).append(product)

# One output line per container: its products joined with " & ", followed by
# the container itself.
for container, products in items.items():
    print(" & ".join(products), container)

Related

Loop scrapes the same page 20 times instead of iterating through range

I'm trying to scrape IMDB for a list of the top 1000 movies and get some details about them. However, when I run it, instead of getting the first 50 movies and going to the next page for the next 50, it repeats the loop and makes the same 50 entries 20 times in my database.
# Question snippet: scrape the IMDb search listing page by page into a DataFrame.
# Dataframe template
data = pd.DataFrame(columns=['ID','Title','Genre','Summary'])
#Get page data function
def getPageContent(start=1):
# NOTE(review): this assignment overwrites the argument, so the URL below is
# always built with start=1 regardless of what the caller passes.
start = 1
url = 'https://www.imdb.com/search/title/?title_type=feature&year=1950-01-01,2019-12-31&sort=num_votes,desc&start='+str(start)
r = requests.get(url)
bs = bsp(r.text, "lxml")
return bs
#Run for top 1000
for start in range(1,1001,50):
# NOTE(review): the return value is discarded here; `bs` on the next line is
# not assigned anywhere in the code shown.
getPageContent(start)
movies = bs.findAll("div", "lister-item-content")
for movie in movies:
id = movie.find("span", "lister-item-index").contents[0]
title = movie.find('a').contents[0]
genres = movie.find('span', 'genre').contents[0]
genres = [g.strip() for g in genres.split(',')]
summary = movie.find("p", "text-muted").find_next_sibling("p").contents
# Append the row at the next free integer label.
i = data.shape[0]
data.loc[i] = [id,title,genres,summary]
#Clean data
# data.ID = [float(re.sub('.','',str(i))) for i in data.ID] #remove . from ID
data.head(51)
0 1. The Shawshank Redemption [Drama] [\nTwo imprisoned men bond over a number of ye...
1 2. The Dark Knight [Action, Crime, Drama] [\nWhen the menace known as the Joker wreaks h...
2 3. Inception [Action, Adventure, Sci-Fi] [\nA thief who steals corporate secrets throug...
3 4. Fight Club [Drama] [\nAn insomniac office worker and a devil-may-...
...
46 47. The Usual Suspects [Crime, Drama, Mystery] [\nA sole survivor tells of the twisty events ...
47 48. The Truman Show [Comedy, Drama] [\nAn insurance salesman discovers his whole l...
48 49. Avengers: Infinity War [Action, Adventure, Sci-Fi] [\nThe Avengers and their allies must be willi...
49 50. Iron Man [Action, Adventure, Sci-Fi] [\nAfter being held captive in an Afghan cave,...
50 1. The Shawshank Redemption [Drama] [\nTwo imprisoned men bond over a number of ye...
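Delete the 'start = 1' assignment inside the 'getPageContent' function. It resets 'start' to 1 on every call, so the argument passed in by the loop is ignored and the first page is requested each time.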
#Get page data function
def getPageContent(start=1):
    """Download one page of the IMDb search listing and return it parsed.

    Args:
        start: Value placed in the URL's ``start`` query parameter. It is
            used as passed in; it must not be reassigned inside the function.

    Returns:
        The result of ``bsp(r.text, "lxml")`` for the downloaded page
        (presumably a BeautifulSoup document; ``bsp`` is imported elsewhere).
    """
    url = ('https://www.imdb.com/search/title/?title_type=feature'
           '&year=1950-01-01,2019-12-31&sort=num_votes,desc&start=' + str(start))
    r = requests.get(url)
    return bsp(r.text, "lxml")
I was not able to test this code. See inline comments for what I see as the main issue.
# Dataframe template
data = pd.DataFrame(columns=['ID', 'Title', 'Genre', 'Summary'])


# Get page data function
def getPageContent(start=1):
    """Download one page of the IMDb search listing and return it parsed.

    ``start`` goes into the URL's ``start`` query parameter. Main issue in the
    question's version: it reassigned ``start = 1`` here, so every call
    requested the same first page.
    """
    url = ('https://www.imdb.com/search/title/?title_type=feature'
           '&year=1950-01-01,2019-12-31&sort=num_votes,desc&start=' + str(start))
    r = requests.get(url)
    return bsp(r.text, "lxml")


# Run for top 1000. range(1, 1001, 50) yields 1, 51, ..., 951, i.e. the first
# rank of each page (the question states that a page lists 50 movies), so one
# loop with one request per page is all that is needed.
for start in range(1, 1001, 50):
    # Second issue: the parsed page has to be kept. Calling getPageContent()
    # without using its return value leaves `bs` unset or stale.
    bs = getPageContent(start)
    movies = bs.findAll("div", "lister-item-content")
    for movie in movies:
        # Named movie_id rather than id to avoid shadowing the builtin.
        movie_id = movie.find("span", "lister-item-index").contents[0]
        title = movie.find('a').contents[0]
        genres = movie.find('span', 'genre').contents[0]
        genres = [g.strip() for g in genres.split(',')]
        summary = movie.find("p", "text-muted").find_next_sibling("p").contents
        # Append the row at the next free integer label.
        data.loc[data.shape[0]] = [movie_id, title, genres, summary]
# Clean data
# data.ID = [float(re.sub('.','',str(i))) for i in data.ID] #remove . from ID
data.head(51)

How to order a python dictionary containing a list of values

I'm not sure I am approaching this in the right way.
Scenario:
I have two SQL tables that contain rent information. One table contains rent due, and the other contains rent received.
I'm trying to build a rent book which takes the data from both tables for a specific lease and generates a date ordered statement which will be displayed on a webpage.
I'm using Python, Flask and SQL Alchemy.
I am currently learning Python, so I'm not sure if my approach is the best.
I've created a dictionary which contains the keys 'Date', 'Payment type' and 'Payment Amount', and in each of these keys I store a list which contains the data from my SQL queries. The bit im struggling on is how to sort the dictionary so it sorts by the date key, keeping the values in the other keys aligned to their date.
# Question snippet: collect rent due and rent received for one lease into three
# parallel lists, then store those lists in a dict.
lease_id = 5
dates_list = []
type_list = []
amounts_list = []
rentbook_dict = {}
payments_due = Expected_Rent_Model.query.filter(Expected_Rent_Model.lease_id == lease_id).all()
payments_received = Rent_And_Fee_Income_Model.query.filter(Rent_And_Fee_Income_Model.lease_id == lease_id).all()
# Rows from the "due" table are all labelled 'Rent Due'.
for item in payments_due:
dates_list.append(item.expected_rent_date)
type_list.append('Rent Due')
amounts_list.append(item.expected_rent_amount)
# Rows from the "received" table carry their own payment type.
for item in payments_received:
dates_list.append(item.payment_date)
type_list.append(item.payment_type)
amounts_list.append(item.payment_amount)
# NOTE(review): append() adds each list as a single element, so every key ends
# up holding a one-element list that contains the whole list.
rentbook_dict.setdefault('Date',[]).append(dates_list)
rentbook_dict.setdefault('Type',[]).append(type_list)
rentbook_dict.setdefault('Amount',[]).append(amounts_list)
I was then going to use a for loop within the flask template to iterate through each value and display it in a table on the page.
Or am I approaching this in the wrong way?
So I managed to get this working just by using zipped lists. I'm sure there is a better way for me to accomplish this, but I'm pleased I've got it working.
lease_id = 5
payments_due = Expected_Rent_Model.query.filter(Expected_Rent_Model.lease_id == lease_id).all()
payments_received = Rent_And_Fee_Income_Model.query.filter(Rent_And_Fee_Income_Model.lease_id == lease_id).all()
total_due = 0
for debit in payments_due:
total_due = total_due + int(debit.expected_rent_amount)
total_received = 0
for income in payments_received:
total_received = total_received + int(income.payment_amount)
balance = total_received - total_due
if balance < 0 :
arrears = "This account is in arrears"
else:
arrears = ""
dates_list = []
type_list = []
amounts_list = []
for item in payments_due:
dates_list.append(item.expected_rent_date)
type_list.append('Rent Due')
amounts_list.append(item.expected_rent_amount)
for item in payments_received:
dates_list.append(item.payment_date)
type_list.append(item.payment_type)
amounts_list.append(item.payment_amount)
payment_data = zip(dates_list, type_list, amounts_list)
sorted_payment_data = sorted(payment_data)
tuples = zip(*sorted_payment_data)
list1, list2, list3 = [ list(tuple) for tuple in tuples]
return(render_template('rentbook.html',
payment_data = zip(list1,list2,list3),
total_due = total_due,
total_received = total_received,
balance = balance))

BeautifulSoup trying to get text from wrapped divs but empty or "none" is being returned

Here is a picture (sorry) of the HTML that I am trying to parse:
I am using this line:
home_stats = soup.select_one('div', class_='statText:nth-child(1)').text
Thinking that I'd get the 1st child of the class statText and the outcome would be 53%.
But it's not. I get "Loading..." and none of the data that I was trying to use and display.
The full code I have so far:
# Question snippet: parse an already-downloaded match page (`source`) and print
# the team names and scores, then try to read the first statistic.
soup = BeautifulSoup(source, 'lxml')
home_team = soup.find('div', class_='tname-home').a.text
away_team = soup.find('div', class_='tname-away').a.text
home_score = soup.select_one('.current-result .scoreboard:nth-child(1)').text
away_score = soup.select_one('.current-result .scoreboard:nth-child(2)').text
print("The home team is " + home_team, "and they scored " + home_score)
print()
print("The away team is " + away_team, "and they scored " + away_score)
# NOTE(review): unlike the two select_one() calls above, the selector passed
# here is just 'div'; the nth-child expression is given as a class_ keyword
# instead of being part of the CSS selector. Confirm against the Beautiful
# Soup documentation for select_one().
home_stats = soup.select_one('div', class_='statText:nth-child(1)').text
print(home_stats)
This currently does print the home and away teams and the number of goals they scored. But I can't seem to get any of the statistical content from this site.
My output plan is to have:
[home_team] had 53% ball possession and [away_team] had 47% ball possession
However, I would like to remove the "%" symbols from the parse (but that's not essential). My plan is to use these numbers for more stats later on, so the % symbol gets in the way.
Apologies for the noob question - this is the absolute beginning of my Pythonic journey. I have scoured the internet and StackOverflow and just can not find this situation - I also possibly don't know exactly what I am looking for either.
Thanks kindly for your help! May your answer be the one I pick as "correct" ;)
Assuming that this is the website that you are trying to scrape, here is the complete code to scrape all the stats:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import re

# Load the page in a real browser and grab the resulting HTML (presumably the
# statistics are filled in by JavaScript, which is why the question's plain
# parse only saw "Loading...").
driver = webdriver.Chrome('chromedriver.exe')
try:
    driver.get('https://www.scoreboard.com/en/match/SO3Fg7NR/#match-statistics;0')
    pg = driver.page_source  # Gets the source code of the page
finally:
    # Close the browser window even when loading the page raised.
    driver.close()

soup = BeautifulSoup(pg, 'html.parser')  # Creates a soup object
# Finds all the div tags with class statTextGroup -- these div tags contain the stats
statrows = soup.find_all('div', class_="statTextGroup")

# Scrapes the team names, skipping links whose text is empty
teams = soup.find_all('a', class_="participant-imglink")
teamslst = [name for name in (link.text.strip() for link in teams) if name]

# Only the first MAX_STAT_ROWS statistic rows are used.
MAX_STAT_ROWS = 15

# Maps a statistic name to its values, in the order they appear in the row
# text (e.g. "53%Ball Possession47%" -> {"Ball Possession": ["53%", "47%"]}).
stats_dict = {}
for row in statrows[:MAX_STAT_ROWS]:
    txt = row.text
    # Every run of digits is one value; what remains after removing the digits
    # and percent signs is the statistic's name.
    numbers = re.findall(r'\d+', txt)
    stat = re.sub(r'[\d%]', '', txt).strip()
    if '%' in txt:
        # Percentages stay strings with the sign re-attached.
        values = [number + "%" for number in numbers]
    else:
        values = [int(number) for number in numbers]
    stats_dict.setdefault(stat, []).extend(values)

index = [teamslst[0], teamslst[1]]
# Creates a pandas DataFrame out of the dictionary: one row per statistic, one
# column per team
df = pd.DataFrame(stats_dict, index=index).T
print(df)
Output:
Burnley Southampton
Ball Possession 53% 47%
Goal Attempts 10 5
Shots on Goal 2 1
Shots off Goal 4 2
Blocked Shots 4 2
Free Kicks 11 10
Corner Kicks 8 2
Offsides 2 1
Goalkeeper Saves 0 2
Fouls 8 10
Yellow Cards 1 0
Total Passes 522 480
Tackles 15 12
Attacks 142 105
Dangerous Attacks 44 29
Hope that this helps!
P.S.: I actually wrote this code for a different question, but I didn't post it because an answer had already been posted. I didn't know that it would come in handy now! Anyway, I hope that my answer does what you need.

Continued Difficulties with Python Question

I asked this question earlier and am still having difficulties. I tried a new approach that isn't working. Essentially, I'm trying to implement a program that performs a calculation using Python objects to represent data. I want to determine the name of the county that had the highest voter turnout in a previous election, as well as the percentage of the population who voted. I need to use two function names, but can manipulate them however I see fit. Here's what I currently have and not sure what mistake I'm making here:
# Question snippet (does not run as written; see the review notes).
#creating a dictionary to store the country name and its percentage
# NOTE(review): nothing in the code shown ever stores anything in this dict.
data = {}
#creating the class county
class County:
def __init__(self,county,population,voters):
# NOTE(review): the parameter is named `county`; `country` is not defined here.
self.country = country
self.voters = voters
self.population = population
self.sorted_data = ""
self.formatted_percentage = ""
def highest_turnout(data) :
# Start with the first element as the best candidate so far.
highest = data[0]
highest_percent = (data[0].voters / data[0].population)
# NOTE(review): the `for` and `if` lines below have no trailing colon, and they
# use the County class itself where an element of `data` would be expected.
for data in County
if (County.voters / County.population) > highest_percent
highest = County
highest_percent = County.data
# NOTE(review): the function has no return statement, and the objects below are
# bound to separate names rather than collected into a list.
allegheny = County("allegheny", 1000490, 645469)
philadelphia = County("philadelphia", 1134081, 539069)
montgomery = County("montgomery", 568952, 399591)
lancaster = County("lancaster", 345367, 230278)
delaware = County("delaware", 414031, 284538)
chester = County("chester", 319919, 230823)
bucks = County("bucks", 444149, 319816)
I need the “highest_turnout” function to do this:
Find the County that has the highest turnout, i.e. the highest percentage of the
population who voted, using the objects’ population and voters attributes
Return a tuple containing the name of the County with the highest turnout and the
percentage of the population who voted, in that order; the percentage should be
represented as a number between 0 and 1
Display the results of any “print” functions, as well as the last one which prints the return value of the function. Note that your highest_turnout function should correctly determine the County with the highest turnout for any input list
Any explanations / advice on how to approach this would be greatly appreciated. Thank you as I'm pretty new to Python and want to learn as much as possible.
class County:
    """Election figures for a single county.

    Attributes:
        county: Name of the county.
        population: Number of people living in the county.
        voters: Number of people in the county who voted.
    """

    def __init__(self, county, population, voters):
        self.county = county
        self.voters = voters
        self.population = population
        # Kept from the original snippet; nothing in this code reads them.
        self.sorted_data = ""
        self.formatted_percentage = ""


def highest_turnout(data):
    """Return the county with the highest turnout and that turnout.

    Turnout is ``voters / population``, a number between 0 and 1. When several
    counties share the highest turnout, the one that comes first in ``data``
    is returned.

    Args:
        data: Iterable of ``County`` objects.

    Returns:
        A tuple ``(county_name, turnout)``.

    Raises:
        ValueError: If ``data`` contains no counties.
    """
    def turnout(county):
        return county.voters / county.population

    # max() finds the winner in one pass; no need to sort the whole list.
    best = max(data, key=turnout, default=None)
    if best is None:
        raise ValueError("highest_turnout() needs at least one County")
    return best.county, turnout(best)


# The counties to compare.
data = [
    County("allegheny", 1000490, 645469),
    County("philadelphia", 1134081, 539069),
    County("montgomery", 568952, 399591),
    County("lancaster", 345367, 230278),
    County("delaware", 414031, 284538),
    County("chester", 319919, 230823),
    County("bucks", 444149, 319816),
]
print(highest_turnout(data))
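FYI: your original code had indentation errors, was missing the colons after the `for` and `if` lines, and assigned `self.country = country` although the parameter is named `county`.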

combine two for loops in to fill same dictionary

I am trying to get two different merchants from a list of dictionaries, with priority given to merchants that have prices. If two different merchants with prices are not found, merchant 1 or 2 is to be filled with data from the rest of the list; if the list is not enough, merchant 1 or 2 should be None.
I.e. the for loop will return two merchants, with priority given to merchants that have prices; if that is not enough to fill merchants 1 and 2, it takes merchants with no prices. Finally, if merchant 1 or 2 has still not been created, it is filled with a None value.
here is the code I have so far, it does the job but I believe it can be combined in a more Pythonic way.
import csv
# Question snippet: read the CSV rows, then fill `item` with two distinct
# merchants, preferring rows that have a price.
with open('/home/timmy/testing/example/example/test.csv') as csvFile:
reader=csv.DictReader(csvFile)
for row in reader:
# NOTE(review): dummy_list is not defined anywhere in the code shown.
dummy_list.append(row)
item=dict()
# Number of the merchant slot to fill next (1 or 2); 3 means both are filled.
index = 1
# First pass: only rows with a non-empty price.
for merchant in dummy_list:
if merchant['price']:
if index==2:
# Skip a row whose name was already used for merchant 1.
if item['merchant_1']==merchant['name']:
continue
item['merchant_%d'%index] = merchant['name']
item['merchant_%d_price'%index] = merchant['price']
item['merchant_%d_stock'%index] = merchant['stock']
item['merchant_%d_link'%index] = merchant['link']
if index==3:
break
index+=1
# Second pass: fill any remaining slot from the rows, with or without a price.
for merchant in dummy_list:
if index==3:
break
if index<3:
try:
if item['merchant_1']==merchant['name']:
continue
except KeyError:
# merchant_1 has not been filled yet, so there is nothing to compare with.
pass
item['merchant_%d'%index] = merchant['name']
item['merchant_%d_price'%index] = merchant['price']
item['merchant_%d_stock'%index] = merchant['stock']
item['merchant_%d_link'%index] = merchant['link']
index+=1
# Pad slots that are still unfilled with empty strings.
while index<3:
item['merchant_%d'%index] = ''
item['merchant_%d_price'%index] = ''
item['merchant_%d_stock'%index] = ''
item['merchant_%d_link'%index] = ''
index+=1
print(item)
here is the contents of the csv file:
price,link,name,stock
,https://www.samsclub.com/sams/donut-shop-100-ct-k-cups/prod19381344.ip,Samsclub,
,https://www.costcobusinessdelivery.com/Green-Mountain-Original-Donut-Shop-Coffee%2C-Medium%2C-Keurig-K-Cup-Pods%2C-100-ct.product.100297848.html,Costcobusinessdelivery,
,https://www.costco.com/The-Original-Donut-Shop%2C-Medium-Roast%2C-K-Cup-Pods%2C-100-count.product.100381350.html,Costco,
57.99,https://www.target.com/p/the-original-donut-shop-regular-medium-roast-coffee-keurig-k-cup-pods-108ct/-/A-13649874,Target,Out of Stock
10.99,https://www.target.com/p/the-original-donut-shop-dark-roast-coffee-keurig-k-cup-pods-18ct/-/A-16185668,Target,In Stock
,https://www.homedepot.com/p/Keurig-Kcup-Pack-The-Original-Donut-Shop-Coffee-108-Count-110030/204077166,Homedepot,Undertermined
As you only want to keep at most 2 merchants, I would process the csv only once, keeping separately a list of merchants with prices and a list of merchants without prices, and stopping as soon as 2 merchants with prices have been found.
After that loop, I would concatenate those 2 lists and a list of two empty merchants, and take the first 2 elements of the result. That will be enough to guarantee your requirement of 2 distinct merchants with priority to those having prices. Finally, I would use that to fill the item dict.
Code would be:
import csv
# Read the CSV once, collecting the first row seen for each merchant name:
# rows with a price in one list, rows without a price in another.
names_price = set()
names_no_price = set()
merchant_price = []
merchant_no_price = []
# newline='' is what the csv module documentation asks for when opening files.
with open('/home/timmy/testing/example/example/test.csv', newline='') as csvFile:
    reader = csv.DictReader(csvFile)
    for merchant in reader:
        name = merchant['name']
        if merchant['price']:
            if name not in names_price:
                names_price.add(name)
                merchant_price.append(merchant)
                # Two priced merchants are all we can ever use: stop reading.
                if len(merchant_price) == 2:
                    break
        elif name not in names_no_price:
            names_no_price.add(name)
            merchant_no_price.append(merchant)

# A merchant may have both priced and unpriced rows. Drop its unpriced row so
# the same merchant cannot fill both slots.
fallback = [m for m in merchant_no_price if m['name'] not in names_price]

# Placeholder for a slot that cannot be filled. It is built from the four keys
# read below, so it also works when the file is empty and has no header.
void = dict.fromkeys(('name', 'price', 'stock', 'link'), '')

# Priced merchants first, then unpriced ones, then placeholders; keep two.
merchant_list = (merchant_price + fallback + [void, void])[:2]

item = {}
for index, merchant in enumerate(merchant_list, 1):
    item['merchant_%d' % index] = merchant['name']
    item['merchant_%d_price' % index] = merchant['price']
    item['merchant_%d_stock' % index] = merchant['stock']
    item['merchant_%d_link' % index] = merchant['link']

Categories

Resources