I would like to create a web scraper that collects the specific holdings of an ETF. I found that Zacks.com creates a nice list of what I am looking for. I am trying to use BeautifulSoup, however I am having a difficult time pinpointing the data in the "Symbol" column. What do I need to change or add to collect all the symbols as a list?
import requests
from bs4 import BeautifulSoup

tickers = ["XLU", "XLRE"]  # list of tickers whose financial data needs to be extracted
financial_dir = {}

for ticker in tickers:
    # getting holdings data from Zacks for the given ticker
    temp_dir = {}
    url = 'https://www.zacks.com/funds/etf/' + ticker + '/holding'
    page = requests.get(url)
    page_content = page.content
    soup = BeautifulSoup(page_content, 'html.parser')
    tabl = soup.find_all("table", {"id": "etf_holding_table"})
    for t in tabl:
        rows = t.find_all("button")
The holdings table is populated from data embedded in the page's JavaScript, so instead of parsing the rendered table you can pull the symbols straight out of the raw response with a regular expression:

import requests
import re

keys = ['XLU', 'XLRE']

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
}


def main(url):
    with requests.Session() as req:
        req.headers.update(headers)
        for key in keys:
            r = req.get(url.format(key))
            print(f"Extracting: {r.url}")
            goal = re.findall(r'etf\\\/(.*?)\\', r.text)
            print(goal)


main("https://www.zacks.com/funds/etf/{}/holding")
Output:
Extracting: https://www.zacks.com/funds/etf/XLU/holding
['NEE', 'DUK', 'D', 'SO', 'AEP', 'XEL', 'EXC', 'SRE', 'WEC', 'ES', 'PEG', 'AWK', 'ED', 'DTE', 'PPL', 'ETR', 'AEE', 'EIX', 'CMS', 'FE', 'LNT', 'AES', 'ATO', 'EVRG', 'CNP', 'PNW', 'NI', 'NRG']
Extracting: https://www.zacks.com/funds/etf/XLRE/holding
['AMT', 'PLD', 'CCI', 'EQIX', 'DLR', 'PSA', 'SBAC', 'WELL', 'AVB', 'O', 'WY', 'SPG', 'ARE', 'EQR', 'VTR', 'CBRE', 'PEAK', 'EXR', 'DRE', 'MAA', 'ESS', 'BXP', 'UDR', 'HST', 'IRM', 'REG', 'VNO', 'AIV', 'FRT', 'KIM', 'SLG']
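For anyone wondering what that pattern matches: the ticker links sit inside escaped JSON in the page source, with backslash-escaped slashes. Here is a quick self-contained illustration; the sample string is a made-up fragment mimicking that payload, not the actual Zacks markup:

import re

# Hypothetical fragment shaped like the escaped JSON in the page source,
# where ticker links look like etf\/NEE\/... .
sample = r'"<a href=\"\/funds\/etf\/NEE\/holding\">NEE<\/a>", "<a href=\"\/funds\/etf\/DUK\/holding\">DUK<\/a>"'

# etf\\\/(.*?)\\ matches a literal backslash and slash after "etf",
# then captures lazily up to the next literal backslash.
print(re.findall(r'etf\\\/(.*?)\\', sample))  # ['NEE', 'DUK']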
To answer Andrew Hick's comment, here is a version you can use to get weights, for ETFs and mutual funds. You'll have to play with the formatting a bit.
import requests
import re

etf_keys = ['XLU', 'XLRE']
mutual_fund_keys = ['VFTAX']

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
}


def main_etf(url):
    with requests.Session() as req:
        req.headers.update(headers)
        for key in etf_keys:
            r = req.get(url.format(key))
            print(f"Extracting: {r.url}")
            etf_stock_list = re.findall(r'etf\\\/(.*?)\\', r.text)
            print(etf_stock_list)
            etf_stock_details_list = re.findall(
                r'<\\\/span><\\\/span><\\\/a>",(.*?), "<a class=\\\"report_document newwin\\', r.text)
            print(etf_stock_details_list)


def main_mutual(url):
    with requests.Session() as req:
        req.headers.update(headers)
        for key in mutual_fund_keys:
            r = req.get(url.format(key))
            print(f"Extracting: {r.url}")
            mutual_stock_list = re.findall(r'\\\/mutual-fund\\\/quote\\\/(.*?)\\', r.text)
            print(mutual_stock_list)
            mutual_stock_details_list = re.findall(r'"sr-only\\\"><\\\/span><\\\/span><\\\/a>",(.*?)%", "', r.text)
            print(mutual_stock_details_list)


main_etf("https://www.zacks.com/funds/etf/{}/holding")
main_mutual("https://www.zacks.com/funds/mutual-fund/quote/{}/holding")
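To line the symbols up with their details inside main_etf, you could zip the two lists. This is a sketch that assumes both findall calls return one entry per holding, which is worth verifying, since a single regex miss will shift every pair after it:

# Inside main_etf, after both findall calls: pair each symbol with its
# raw details string (weight, shares, etc.).
for symbol, details in zip(etf_stock_list, etf_stock_details_list):
    print(key, symbol, details)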
With the current code, I can scrape multiple prices, but it doesn't automatically re-scrape them every 2 minutes, which is what I need.
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

mystocks = ['GOOG', 'META', 'MSFT', 'PLTR', 'TSLA', 'ZS', 'PYPL', 'SHOP', 'TTCF']
stockdata = []


def getData(symbol):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    url = f'https://finance.yahoo.com/quote/{symbol}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    stock = {
        'symbol': symbol,
        'price': soup.find('div', {'class': 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
    }
    return stock


for item in mystocks:
    stockdata.append(getData(item))


def export_data(stockdata):
    df = pd.DataFrame(stockdata)
    df.to_excel("LETS GO2.xlsx")


if __name__ == '__main__':
    while True:
        getData(item)
        export_data(stockdata)
        time_wait = 2
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)
Your for-loop is in the wrong place. Put it inside your while True: block so that every ticker is re-scraped every two minutes.
EDIT:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

mystocks = ['GOOG', 'META', 'MSFT', 'PLTR', 'TSLA', 'ZS', 'PYPL', 'SHOP', 'TTCF']


def getData(symbol):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    url = f'https://finance.yahoo.com/quote/{symbol}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    stock = {
        'symbol': symbol,
        'price': soup.find('div', {'class': 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
    }
    return stock


def export_data(stockdata):
    df = pd.DataFrame(stockdata)
    df.to_excel("LETS GO2.xlsx")


if __name__ == "__main__":
    while True:
        stockdata = []
        for item in mystocks:
            print(item)
            stockdata.append(getData(item))
        export_data(stockdata)
        time_wait = 0.1  # shortened for testing; use 2 for two-minute intervals
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)
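Note that each cycle overwrites "LETS GO2.xlsx". If you'd rather keep one snapshot per scrape, here is a small variation of export_data (the filename pattern is just an illustration):

from datetime import datetime

def export_data(stockdata):
    df = pd.DataFrame(stockdata)
    # One file per cycle, e.g. prices_20240101-120000.xlsx, instead of
    # overwriting the same workbook every two minutes.
    stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    df.to_excel(f"prices_{stamp}.xlsx")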
Below is my Python code and its output. I want the output as rows and columns in a DataFrame:
import requests
from bs4 import BeautifulSoup

response = requests.get(source_data)
soup = BeautifulSoup(response.text, "html.parser")

States = soup.find_all('div', class_='card bg-darker p-3 mb-3')
for item in States:
    state_name = item.find(class_='fw-bold fs-5 mb-2').text
    vaccinated_per = item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    print(state_name, vaccinated_per)
Output:
Flanders 80.24%
Wallonia 70.00%
Brussels 56.73%
Ostbelgien 65.11%
Collect your information in a list of dicts and then simply create a DataFrame from it:
data = []
for item in States:
    data.append({
        'state_name': item.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per': item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    })

pd.DataFrame(data)
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
response = requests.get('https://covid-vaccinatie.be/en', headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

States = soup.find_all('div', class_='card bg-darker p-3 mb-3')

data = []
for item in States:
    data.append({
        'state_name': item.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per': item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    })

pd.DataFrame(data)
Output
   state_name vaccinated_per
0    Flanders         80.24%
1    Wallonia         70.00%
2    Brussels         56.73%
3  Ostbelgien         65.11%
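If you later need the percentages as numbers rather than strings, here is one small follow-up, assuming every value ends in '%':

df = pd.DataFrame(data)
# Strip the trailing '%' and cast so the column can be used in arithmetic.
df['vaccinated_per'] = df['vaccinated_per'].str.rstrip('%').astype(float)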
I tried many methods, but none of them solved it; they all show me the error:
All arrays must be of the same length
from bs4 import BeautifulSoup
import requests
import pandas as pd

review = []
ratings = []

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

for page in range(1, 5):
    r = requests.get(
        "https://www.amazon.com/s?k=redmi&page=2&qid=1631528810&ref=sr_pg_={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    for d in soup.findAll("div", attrs={"class": "s-result-item"}):
        rating = d.find("span", attrs={"class": "a-icon-alt"})
        if rating is not None:
            ratings.append(rating.text)
        reviews = d.find("span", class_="a-size-base")
        if reviews is not None:
            review.append(reviews.text)

df = pd.DataFrame({"rating": ratings, "reviews": review})
df.to_csv("products.csv", index=False, encoding="utf-8")
ratings and review aren't the same length, and you were scraping the wrong containers. I made the necessary modifications; now it should work:
from bs4 import BeautifulSoup
import requests
import pandas as pd

review = []
ratings = []

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

for page in range(1, 5):
    cookies = {'session': '17ab96bd8ffbe8ca58a78657a918558'}
    r = requests.get(
        "https://www.amazon.com/s?k=redmi&page=2&qid=1631528810&ref=sr_pg_={page}".format(
            page=page
        ),
        headers=headers,
        cookies=cookies,
    )
    soup = BeautifulSoup(r.content, "lxml")
    # Select only real search results, not empty or ad placeholder slots.
    for d in soup.select(".s-result-item[data-component-type='s-search-result']"):
        rating = d.find("span", attrs={"class": "a-icon-alt"})
        if rating is not None:
            ratings.append(rating.text)
        else:
            ratings.append("-")
        reviews = d.find("span", class_="a-size-base")
        if reviews is not None and rating is not None:
            review.append(reviews.text)
        else:
            review.append("-")

df = pd.DataFrame({"rating": ratings, "reviews": review})
df.to_csv("products.csv", index=False, encoding="utf-8")
I am trying to scrape design names, creator names, fabric types, and prices per fabric type from this URL: https://www.spoonflower.com/en/shop?on=fabric
The good thing is they have public API endpoints, which makes the data extraction simple.
But the problem is they have different URLs for design names and for pricing.
To collect design names and creator names I have to ping this URL: https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
And for pricing per fabric type I request this endpoint:
https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
I am getting the correct data, but I stumbled across some formatting issues.
What I am looking for is each design with all its fabric types and their prices in a single row. Instead, I am getting one row per design and fabric type.
It would be great if anyone here could guide me on how to get the expected output I am looking for.
Below is my code:
import requests
from bs4 import BeautifulSoup
import json
import csv

cookies = {
    'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
    'Content-Type': 'application/json',
    'Origin': 'https://www.spoonflower.com',
    'Connection': 'keep-alive',
    'Referer': 'https://www.spoonflower.com/',
    'Sec-GPC': '1',
    'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
    'TE': 'Trailers',
}

res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
    fabric.append("_".join(fab.upper().split()))

# https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en
# https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_PETAL_SIGNATURE_COTTON?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id=6444170&page_locale=en
item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()

scraped_items = []
for item in item_response['page_results']:
    for fab_type in fabric:
        details_endpoint = ('https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type
                            + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id='
                            + str(item['designId']) + '&page_locale=en')
        details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
        designName = item['name']
        screenName = item['user']['screenName']
        fabric_name = details_endpoint_response['data']['fabric_code']
        try:
            test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
        except:
            test_swatch_meter = 'N/A'
        try:
            fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
        except:
            fat_quarter_meter = 'N/A'
        try:
            meter = details_endpoint_response['data']['pricing']['METER']['price']
        except:
            meter = 'N/A'
        scraped_items.append({
            'designName': designName,
            'screenName': screenName,
            'fabric_name': fabric_name,
            'test_swatch_meter': test_swatch_meter,
            'fat_quarter_meter': fat_quarter_meter,
            'meter': meter
        })
        print(designName, screenName, fabric_name, test_swatch_meter, fat_quarter_meter, meter)

print(json.dumps(scraped_items, indent=2))

with open('scraped_data.csv', 'w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=scraped_items[0].keys())
    writer.writeheader()
    for row in scraped_items:
        writer.writerow(row)
One way to do it is just to reconfigure how you construct the output. Instead of a list, use a dictionary keyed by (designName, screenName), with the fabric details as the values. One thing to keep in mind is that dictionaries don't allow duplicate keys, so the column names had to be numbered; you can remove the numbers later if you'd like.
See if this gets you what you're after:
import requests
from bs4 import BeautifulSoup
import json
import csv
import pandas as pd
from collections import OrderedDict

cookies = {
    'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
    'Content-Type': 'application/json',
    'Origin': 'https://www.spoonflower.com',
    'Connection': 'keep-alive',
    'Referer': 'https://www.spoonflower.com/',
    'Sec-GPC': '1',
    'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
    'TE': 'Trailers',
}

res = requests.get('https://www.spoonflower.com/spoonflower_fabrics')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
    fabric.append("_".join(fab.upper().split()))

item_endpoint = 'https://pythias.spoonflower.com/search/v1/designs?lang=en&page_offset=0&sort=bestSelling&product=Fabric&forSale=true&showMatureContent=false&page_locale=en'
item_response = requests.get(item_endpoint).json()

items_dict = OrderedDict()
for item in item_response['page_results']:
    for fab_type in fabric:
        details_endpoint = ('https://api-gateway.spoonflower.com/alpenrose/pricing/fabrics/FABRIC_' + fab_type
                            + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id='
                            + str(item['designId']) + '&page_locale=en')
        details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
        designName = item['name']
        screenName = item['user']['screenName']
        fabric_name = details_endpoint_response['data']['fabric_code']
        try:
            test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
        except:
            test_swatch_meter = 'N/A'
        try:
            fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
        except:
            fat_quarter_meter = 'N/A'
        try:
            meter = details_endpoint_response['data']['pricing']['METER']['price']
        except:
            meter = 'N/A'
        if (designName, screenName) not in items_dict.keys():
            items_dict[(designName, screenName)] = {}
        # Four values are stored per fabric, so the number of values already
        # present, divided by 4, numbers the next fabric's columns.
        itemCount = len(items_dict[(designName, screenName)].values()) / 4
        items_dict[(designName, screenName)].update({'fabric_name_%02d' % itemCount: fabric_name,
                                                     'test_swatch_meter_%02d' % itemCount: test_swatch_meter,
                                                     'fat_quarter_meter_%02d' % itemCount: fat_quarter_meter,
                                                     'meter_%02d' % itemCount: meter})
        print(designName, screenName, fabric_name, test_swatch_meter, fat_quarter_meter, meter)

df = pd.DataFrame.from_dict(items_dict, orient='index').reset_index(drop=False)
df = df.rename(columns={'level_0': 'designName', 'level_1': 'screenName'})
df.to_csv('scraped_data.csv', index=False)
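If you do want to drop the numeric suffixes afterwards, here is a short sketch; it assumes you are happy with repeated column labels in the CSV header:

import re

# Strip the trailing _00, _01, ... counters from the generated column names.
# designName and screenName have no suffix, so they are left untouched.
df.columns = [re.sub(r'_\d+$', '', c) for c in df.columns]
df.to_csv('scraped_data.csv', index=False)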
So I tried the following rough version:
import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

session = requests.Session()
res1 = session.get('http://www.instacart.com', headers=headers)
soup = BeautifulSoup(res1.content, 'html.parser')
token = soup.find('meta', {'name': 'csrf-token'}).get('content')
data = {"user": {"email": "user@gmail.com", "password": "xxxxx"},
        "authenticity_token": token}
res2 = session.post('https://www.instacart.com/accounts/login', headers=headers, data=data)
print(res2)
I always get the following error:
<Response [400]>
apparent_encoding:'ascii'
connection:<requests.adapters.HTTPAdapter object at 0x0000021F3FF8F940>
content:b'{"status":400,"error":"There was a problem in the JSON you submitted: Empty input () at line 1, column 1"}'
What am I doing wrong?
Actually, you were missing the correct params for the POST request.
I've made a GET request to the main site to collect the necessary authenticity_token used within the POST request, and then made the POST request to the correct login URL.
import requests
from bs4 import BeautifulSoup

params = {
    'source': 'web',
    'cache_key': 'undefined'
}

data = {
    'email': 'email@email.com',
    'grant_type': 'password',
    'password': 'yourpassword',
    'scope': '',
    'signup_v3_endpoints_web': 'null'
}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
}


def main(url):
    with requests.Session() as req:
        r = req.get(url, headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        data['authenticity_token'] = soup.find(
            "meta", {'name': 'csrf-token'}).get("content")
        r = req.post(
            "https://www.instacart.com/v3/dynamic_data/authenticate/login",
            params=params, json=data, headers=headers).json()
        print(r)


main("https://www.instacart.com")