Web Scraping For Loop Issue - python

I have an issue with collecting all the data on the website. When I run my code it only prints out the first entry, but it should print out every Song, Artist, and Rank. It also doesn't show up in the CSV.
from bs4 import BeautifulSoup
import requests
import csv

my_url = "https://www.billboard.com/charts/hot-100"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}
r = requests.get(my_url)
page_soup = BeautifulSoup(r.content, 'lxml')

filename = "Billboard100.csv"
csv_writer = csv.writer(open(filename, 'w'))

Chart = page_soup.findAll('ol', class_='chart-list__elements')
BB = []
for item in Chart:
    Song = item.find('span', class_='chart-element__information__song text--truncate color--primary').text.strip()
    Artist = item.find('span', class_='chart-element__information__artist text--truncate color--secondary').text.strip()
    Rank = item.find('span', class_='chart-element__rank__number').text.strip()
    Billboard = {
        'Song': Song,
        'Artist': Artist,
        'Rank': Rank,
    }
    BB.append(Billboard)
print(BB)

with open("Billboard100.csv", "w", newline="") as infile:
    writer = csv.writer(infile)

In your code, Chart had length one, so the loop ran only once. Use Chart = page_soup.find_all('li', {'class': 'chart-list__element display--flex'}) to select all the entries.
from bs4 import BeautifulSoup
import requests
import csv

my_url = "https://www.billboard.com/charts/hot-100"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}
r = requests.get(my_url, headers=headers)  # actually send the User-Agent header
page_soup = BeautifulSoup(r.content, 'lxml')

# Select every chart entry (<li>), not the single <ol> container
Chart = page_soup.find_all('li', {'class': 'chart-list__element display--flex'})
BB = []
for item in Chart:
    Song = item.find('span', class_='chart-element__information__song text--truncate color--primary').text.strip()
    Artist = item.find('span', class_='chart-element__information__artist text--truncate color--secondary').text.strip()
    Rank = item.find('span', class_='chart-element__rank__number').text.strip()
    Billboard = {
        'Song': Song,
        'Artist': Artist,
        'Rank': Rank,
    }
    BB.append(Billboard)
print(BB)

with open("Billboard100.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['Song', 'Artist', 'Rank'])  # header row
    for row in BB:
        # write one column per field instead of the whole dict in one cell
        writer.writerow([row['Song'], row['Artist'], row['Rank']])
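If you'd rather not spell the fields out by hand, csv.DictWriter can write the list of dicts directly; a minimal sketch, assuming the BB list built above:
import csv

with open("Billboard100.csv", "w", newline="") as outfile:
    # DictWriter maps each dict's keys onto the named columns
    writer = csv.DictWriter(outfile, fieldnames=['Song', 'Artist', 'Rank'])
    writer.writeheader()
    writer.writerows(BB)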

Your code isn't well indented. In Python, always check your indentation when some code doesn't run at all; see the official documentation on indentation.
for item in Chart:
Song = item.find('span', class_='chart-element__information__song text--truncate color--primary').text.strip()
Artist = item.find('span', class_='chart-element__information__artist text--truncate color--secondary').text.strip()
Rank = item.find('span', class_='chart-element__rank__number').text.strip()
Billboard = {
should be
for item in Chart:
    Song = item.find('span', class_='chart-element__information__song text--truncate color--primary').text.strip()
    Artist = item.find('span', class_='chart-element__information__artist text--truncate color--secondary').text.strip()
    Rank = item.find('span', class_='chart-element__rank__number').text.strip()
    Billboard = {
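For reference, the unindented version fails immediately rather than running partially; Python raises an error similar to this, which is the first thing to look for:
  File "script.py", line 2
    Song = item.find('span', class_='chart-element__information__song text--truncate color--primary').text.strip()
    ^
IndentationError: expected an indented block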


Can you scrape multiple stock prices on a recurring schedule?

With the current code I can scrape multiple prices, but it doesn't automatically re-scrape them every 2 minutes, which is what I need.
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

mystocks = ['GOOG', 'META', 'MSFT', 'PLTR', 'TSLA', 'ZS', 'PYPL', 'SHOP', 'TTCF']
stockdata = []

def getData(symbol):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    url = f'https://finance.yahoo.com/quote/{symbol}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    stock = {
        'symbol': symbol,
        'price': soup.find('div', {'class': 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
    }
    return stock

for item in mystocks:
    stockdata.append(getData(item))

def export_data(stockdata):
    df = pd.DataFrame(stockdata)
    df.to_excel("LETS GO2.xlsx")

if __name__ == '__main__':
    while True:
        getData(item)
        export_data(stockdata)
        time_wait = 2
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)
Your for-loop is in the wrong place. Put it inside your while True: block so that you loop over every ticker every two minutes.
EDIT:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

mystocks = ['GOOG', 'META', 'MSFT', 'PLTR', 'TSLA', 'ZS', 'PYPL', 'SHOP', 'TTCF']

def getData(symbol):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    url = f'https://finance.yahoo.com/quote/{symbol}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    stock = {
        'symbol': symbol,
        'price': soup.find('div', {'class': 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
    }
    return stock

def export_data(stockdata):
    df = pd.DataFrame(stockdata)
    df.to_excel("LETS GO2.xlsx")

if __name__ == "__main__":
    while True:
        stockdata = []
        for item in mystocks:
            print(item)
            stockdata.append(getData(item))
        export_data(stockdata)
        time_wait = 0.1  # shortened for testing; set back to 2 for a two-minute interval
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)
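Note that each pass overwrites the same Excel file. If you want to keep every two-minute snapshot instead, one option is a timestamped filename; a minimal sketch of a drop-in export_data replacement (the stocks_ prefix is only an example, not part of the original answer):
import pandas as pd
from datetime import datetime

def export_data(stockdata):
    df = pd.DataFrame(stockdata)
    # e.g. stocks_20240101-120000.xlsx, so earlier snapshots survive
    stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    df.to_excel(f'stocks_{stamp}.xlsx', index=False)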

Why is this web scrape not working in Python?

I haven't used the attached code recently. For the past few weeks it had been working completely fine and always produced results. However, when I used it today it didn't work for some reason. Could you please help and provide a solution to the problem?
import requests, json
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {"q": "dji", "hl": "en", 'gl': 'us', 'tbm': 'shop'}
response = requests.get("https://www.google.com/search",
                        params=params,
                        headers=headers)
soup = BeautifulSoup(response.text, 'lxml')

# list with two dict() combined
shopping_data = []
shopping_results_dict = {}

for shopping_result in soup.select('.sh-dgr__content'):
    title = shopping_result.select_one('.Lq5OHe.eaGTj h4').text
    product_link = f"https://www.google.com{shopping_result.select_one('.Lq5OHe.eaGTj')['href']}"
    source = shopping_result.select_one('.IuHnof').text
    price = shopping_result.select_one('span.kHxwFf span').text
    try:
        rating = shopping_result.select_one('.Rsc7Yb').text
    except:
        rating = None
    try:
        reviews = shopping_result.select_one('.Rsc7Yb').next_sibling.next_sibling
    except:
        reviews = None
    try:
        delivery = shopping_result.select_one('.vEjMR').text
    except:
        delivery = None
    shopping_results_dict.update({
        'shopping_results': [{
            'title': title,
            'link': product_link,
            'source': source,
            'price': price,
            'rating': rating,
            'reviews': reviews,
            'delivery': delivery,
        }]
    })
    shopping_data.append(dict(shopping_results_dict))
print(title)
Because .select in for shopping_result in soup.select('.sh-dgr__content'): could not find any elements, it gives you an empty list. Therefore the body of the for-loop is never executed and Python skips straight past the loop.
title only exists once the body of the for-loop has executed at least once, so the print(title) afterwards raises a NameError.
You should make sure you are using a correct method (and up-to-date selectors) to find your elements.
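A quick way to confirm this is to check the HTTP status and the selector's match count before looping; a short sketch, reusing the response and soup from the code above:
print(response.status_code)                # 200 alone does not guarantee usable HTML
results = soup.select('.sh-dgr__content')
print(len(results))                        # 0 means the selector no longer matches
if not results:
    # dump what Google actually returned so you can inspect it
    with open('debug.html', 'w', encoding='utf-8') as f:
        f.write(response.text)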

Create Rows and Columns in BeautifulSoup

Below is my Python code and its output. I want the output as rows and columns in a DataFrame:
response = requests.get(source_data)
soup = BeautifulSoup(response.text, "html.parser")
States = soup.find_all('div', class_='card bg-darker p-3 mb-3')
for item in States:
    state_name = item.find(class_='fw-bold fs-5 mb-2').text
    vaccinated_per = item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    print(state_name, vaccinated_per)
Output:
Flanders 80.24%
Wallonia 70.00%
Brussels 56.73%
Ostbelgien 65.11%
Collect your information in a list of dicts and then simply create a data frame from it:
data = []
for item in States:
    data.append({
        'state_name': item.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per': item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    })
pd.DataFrame(data)
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
response = requests.get('https://covid-vaccinatie.be/en', headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

States = soup.find_all('div', class_='card bg-darker p-3 mb-3')

data = []
for item in States:
    data.append({
        'state_name': item.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per': item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    })
pd.DataFrame(data)
Output
  state_name vaccinated_per
0   Flanders         80.24%
1   Wallonia         70.00%
2   Brussels         56.73%
3 Ostbelgien         65.11%
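If you also want the table as a file rather than just displayed, pandas can write it out directly; for example (the filename is only illustrative):
df = pd.DataFrame(data)
df.to_csv('vaccinations.csv', index=False)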

Web scraper returning zero in terminal

I'm trying to scrape this website:
https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who=bygg&bf=1&page=1
I've put a def getQuestions(tag) in the who={tag} part of the URL and that works fine. When I try to add def getQuestions(tag, page) with page={page}, it just returns 0 in the terminal, and I really have no clue what could be causing this.
Here is the full code:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}

questionlist = []

def getQuestions(tag, page):
    url = 'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={bygg}&bf=1&page={page}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    questions = soup.find_all('div', {'class': 'box-white p-0 mb-4'})
    for item in questions:
        question = {
            'title': item.find('a', {'class': 'link-primary'}).text,
            'link': item.find('a', {'class': 'link-primary'})['href'],
            'nummer': item.find('a', {'class': 'link-body'})['href'],
            'address': item.find('address', {'class': 'mt-2 mb-0'}).text,
            'RegÅr': item.find('div', {'class': 'col text-center'}).text,
        }
        questionlist.append(question)
    return

for x in range(1, 5):
    getQuestions('bygg', x)

print(len(questionlist))
Any help would be appreciated. Best regards!
Change the string in the url variable to an f-string:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
}

def getQuestions(tag, page):
    questionlist = []
    url = f"https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    questions = soup.find_all("div", {"class": "box-white p-0 mb-4"})
    for item in questions:
        question = {
            "title": item.find("a", {"class": "link-primary"}).text,
            "link": item.find("a", {"class": "link-primary"})["href"],
            "nummer": item.find("a", {"class": "link-body"})["href"],
            "address": item.find("address", {"class": "mt-2 mb-0"}).text,
            "RegÅr": item.find("div", {"class": "col text-center"}).text,
        }
        questionlist.append(question)
    return questionlist

out = []
for x in range(1, 5):
    out.extend(getQuestions("bygg", x))

print(len(out))
Prints:
80
Try changing your url to this:
url = f'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}'
You didn't quite have your f-string set up right.
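The difference is easy to see in isolation: without the f prefix the braces are kept literally, so every request went to the literal who={bygg}...page={page} URL, which matches nothing:
page = 3
print('page={page}')   # -> page={page}  (plain string, braces kept literally)
print(f'page={page}')  # -> page=3       (f-string substitutes the variable)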

Formatting scraped data Python Beautifulsoup

I am trying to scrape design names, creator names, fabric types, and prices per fabric type from this URL: https://www.spoonflower.com/en/shop?on=fabric
The good thing is they have public API endpoints, which makes the data extraction simple. But the problem is they have different URLs for design names and for pricing,
i.e. to collect the names of the design and the creator I have to ping this URL https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
and for pricing per fabric type I request this endpoint:
https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
I am getting the correct data, but I stumbled across some formatting issues. What I am looking for is each design with its fabric types alongside their prices in a single row; instead, each design/fabric combination comes out as its own row.
It would be great if anyone here can guide me through how to get the expected output I am looking for.
Below is my code:
import requests
from bs4 import BeautifulSoup
import json
import csv

cookies = {
    'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
    'Content-Type': 'application/json',
    'Origin': 'https://www.spoonflower.com',
    'Connection': 'keep-alive',
    'Referer': 'https://www.spoonflower.com/',
    'Sec-GPC': '1',
    'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
    'TE': 'Trailers',
}

res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
    fabric.append("_".join(fab.upper().split()))

#https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
#https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]

scraped_items = []
for item in item_response['page_results']:
    for fab_type in fabric:
        details_endpoint = 'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=' + str(item['designId']) + '&page_locale=en'
        details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
        designName = item['name'],
        screeName = item['user']['screenName']
        fabric_name = details_endpoint_response['data']['fabric_code']
        try:
            test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
        except:
            test_swatch_meter = 'N/A'
        try:
            fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
        except:
            fat_quarter_meter = 'N/A'
        try:
            meter = details_endpoint_response['data']['pricing']['METER']['price']
        except:
            meter = 'N/A'
        scraped_items.append({
            'designName': designName,
            'screenName': screeName,
            'fabric_name': fabric_name,
            'test_swatch_meter': test_swatch_meter,
            'fat_quarter_meter': fat_quarter_meter,
            'meter': meter
        })
        print(designName, screeName, fabric_name, test_swatch_meter, fat_quarter_meter, meter)

print(json.dumps(scraped_items, indent=2))
#print(type(details_endpoint))
#print(type(items_json['page_results'][0]))

with open('scraped_data.csv', 'w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=scraped_items[0].keys())
    writer.writeheader()
    for row in scraped_items:
        writer.writerow(row)
#print(fabric)
One way to do it is to reconfigure how you construct the output. Instead of a list, use a dictionary keyed by (designName, screenName), with the fabric and price values collected under that key. One thing to keep in mind is that dictionaries don't allow duplicate keys, so the column names had to be numbered; you can remove the numbering later if you'd like.
See if this gets what you are wanting:
import requests
from bs4 import BeautifulSoup
import json
import csv
import pandas as pd
from collections import OrderedDict

cookies = {
    'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
    'Content-Type': 'application/json',
    'Origin': 'https://www.spoonflower.com',
    'Connection': 'keep-alive',
    'Referer': 'https://www.spoonflower.com/',
    'Sec-GPC': '1',
    'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
    'TE': 'Trailers',
}

res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
    fabric.append("_".join(fab.upper().split()))

#https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
#https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]

items_dict = OrderedDict()
for item in item_response['page_results']:
    for fab_type in fabric:
        details_endpoint = 'https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=' + str(item['designId']) + '&page_locale=en'
        details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
        designName = item['name']
        screenName = item['user']['screenName']
        fabric_name = details_endpoint_response['data']['fabric_code']
        try:
            test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
        except:
            test_swatch_meter = 'N/A'
        try:
            fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
        except:
            fat_quarter_meter = 'N/A'
        try:
            meter = details_endpoint_response['data']['pricing']['METER']['price']
        except:
            meter = 'N/A'

        # One dict per (designName, screenName); each fabric adds 4 more columns to it
        if (designName, screenName) not in items_dict.keys():
            items_dict[(designName, screenName)] = {}
        # Number the column names so repeated fabrics don't collide as dict keys
        itemCount = len(items_dict[(designName, screenName)].values()) / 4
        items_dict[(designName, screenName)].update({
            'fabric_name_%02d' % itemCount: fabric_name,
            'test_swatch_meter_%02d' % itemCount: test_swatch_meter,
            'fat_quarter_meter_%02d' % itemCount: fat_quarter_meter,
            'meter_%02d' % itemCount: meter})
        print(designName, screenName, fabric_name, test_swatch_meter, fat_quarter_meter, meter)

df = pd.DataFrame.from_dict(items_dict, orient='index').reset_index(drop=False)
df = df.rename(columns={'level_0': 'designName', 'level_1': 'screenName'})
df.to_csv('scraped_data.csv', index=False)
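If you'd rather avoid the numbered columns, another option is to keep the question's flat scraped_items list (with the stray trailing comma after item['name'] removed, so designName is a string rather than a tuple) and let pandas pivot it into one row per design; a sketch, assuming pandas 1.1+ for list-valued pivot indexes:
df = pd.DataFrame(scraped_items)
# One row per (designName, screenName); the fabric codes become column groups
wide = df.pivot(index=['designName', 'screenName'],
                columns='fabric_name',
                values=['test_swatch_meter', 'fat_quarter_meter', 'meter'])
wide.columns = ['%s_%s' % (price, fab) for price, fab in wide.columns]  # flatten the MultiIndex
wide.reset_index().to_csv('scraped_data.csv', index=False)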
