With the current code, I can scrape multiple prices, but it doesn't automatically re-scrape them every 2 minutes which is what I need.
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
# Ticker symbols whose quote pages get scraped.
mystocks = [
    'GOOG',
    'META',
    'MSFT',
    'PLTR',
    'TSLA',
    'ZS',
    'PYPL',
    'SHOP',
    'TTCF',
]
# Filled with one record per ticker by the scraping loop.
stockdata = []
def getData(symbol):
    """Download the Yahoo Finance quote page for ``symbol`` and pull out its price.

    Returns a dict with the keys ``'symbol'`` and ``'price'``; the price is the
    text of the first ``fin-streamer`` element inside the
    ``D(ib) Mend(20px)`` div.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    response = requests.get(
        f'https://finance.yahoo.com/quote/{symbol}',
        headers={'User-Agent': user_agent},
    )
    page = BeautifulSoup(response.text, 'html.parser')
    price_box = page.find('div', {'class': 'D(ib) Mend(20px)'})
    first_streamer = price_box.find_all('fin-streamer')[0]
    return {'symbol': symbol, 'price': first_streamer.text}
# Scrape every ticker once at import time; ``item`` stays bound to the last
# symbol after the loop finishes.
for item in mystocks:
    stockdata += [getData(item)]
def export_data(stockdata):
    """Dump ``stockdata`` into the Excel workbook ``LETS GO2.xlsx``."""
    pd.DataFrame(stockdata).to_excel("LETS GO2.xlsx")
if __name__ == '__main__':
    # Repeat forever, pausing `time_wait` minutes between passes.
    while True:
        # NOTE(review): `item` is only the value left over from the
        # module-level for-loop (the last ticker), and the dict returned here
        # is discarded, so `stockdata` is never refreshed inside this loop.
        getData(item)
        # Re-exports the list that was filled once before the loop started.
        export_data(stockdata)
        time_wait = 2
        print(f'Waiting {time_wait} minutes...')
        # time.sleep takes seconds, hence the factor of 60.
        time.sleep(time_wait * 60)
Your for-loop is in the wrong place.
Move it inside your while True: block so that every ticker is scraped again on each two-minute pass.
EDIT:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
# Ticker symbols whose quote pages get scraped on every pass.
mystocks = [
    'GOOG',
    'META',
    'MSFT',
    'PLTR',
    'TSLA',
    'ZS',
    'PYPL',
    'SHOP',
    'TTCF',
]
def getData(symbol):
    """Scrape the current price of ``symbol`` from its Yahoo Finance quote page.

    Args:
        symbol: Ticker symbol, e.g. ``'GOOG'``.

    Returns:
        A dict ``{'symbol': symbol, 'price': <price text>}``.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: The page does not contain the expected price element.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    url = f'https://finance.yahoo.com/quote/{symbol}'
    # Without a timeout a stalled connection would block the polling loop
    # forever.
    r = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    container = soup.find('div', {'class': 'D(ib) Mend(20px)'})
    streamers = container.find_all('fin-streamer') if container is not None else []
    if not streamers:
        # The markup changed or a consent/blocked page came back; say so
        # explicitly rather than failing with an opaque AttributeError.
        raise ValueError(f'No price element found on the quote page for {symbol!r}')
    return {'symbol': symbol, 'price': streamers[0].text}
def export_data(stockdata, filename="LETS GO2.xlsx"):
    """Write the scraped records to an Excel workbook.

    Args:
        stockdata: List of record dicts (as returned by ``getData``); each
            dict becomes one row.
        filename: Destination workbook. Defaults to ``"LETS GO2.xlsx"`` so
            existing one-argument calls keep writing to the same file.
    """
    df = pd.DataFrame(stockdata)
    df.to_excel(filename)
if __name__ == "__main__":
    # Minutes to pause between scraping passes. The requirement is a refresh
    # every 2 minutes; 0.1 would hit the site every 6 seconds.
    time_wait = 2
    while True:
        # Start from an empty list so each export holds only fresh prices.
        stockdata = []
        for item in mystocks:
            print(item)
            stockdata.append(getData(item))
        export_data(stockdata)
        print(f'Waiting {time_wait} minutes...')
        # time.sleep takes seconds, hence the factor of 60.
        time.sleep(time_wait * 60)
Related
import requests
from bs4 import BeautifulSoup
# Browser-like headers sent with the product-page request.
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5'}
# Single product page to scrape.
URL = "https://www.amazon.com/TRESemm%C3%A9-Botanique-Shampoo-Nourish-Replenish/dp/B0199WNJE8/ref=sxin_14_pa_sp_search_thematic_sspa?content-id=amzn1.sym.a15c61b7-4b93-404d-bb70-88600dfb718d%3Aamzn1.sym.a15c61b7-4b93-404d-bb70-88600dfb718d&crid=2HG5WSUDCJBMZ&cv_ct_cx=hair%2Btresemme&keywords=hair%2Btresemme&pd_rd_i=B0199WNJE8&pd_rd_r=28d72361-7f35-4b1a-be43-98e7103da70c&pd_rd_w=6UL4P&pd_rd_wg=JtUqB&pf_rd_p=a15c61b7-4b93-404d-bb70-88600dfb718d&pf_rd_r=DFPZNAG391M5JS55R6HP&qid=1660432925&sprefix=hair%2Btresemme%2Caps%2C116&sr=1-3-a73d1c8c-2fd2-4f19-aa41-2df022bcb241-spons&smid=A3DEFW12560V8M&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUExQlM3VFpGRVM5Tk8wJmVuY3J5cHRlZElkPUEwNjE5MjQwM01JV0FNN1pOMlRHSSZlbmNyeXB0ZWRBZElkPUEwNTA1MDQyMlQ5RjhRQUxIWEdaUiZ3aWRnZXROYW1lPXNwX3NlYXJjaF90aGVtYXRpYyZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1"
webpage = requests.get(URL, headers=headers)
# No parser is named here, so BeautifulSoup chooses one itself.
soup = BeautifulSoup(webpage.content)
# Both values come from the third child of the span containing "Best Seller":
# the first whitespace-separated token is the rank ...
rank = soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].get_text().split()[0]
# ... and tokens 2 through 5 are joined back into the category text.
Category = soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].get_text().split()[2:6]
Category = ' '.join(Category)
# Bare expressions: the results are discarded (presumably left over from an
# interactive session where they were echoed).
type(rank)
type(Category)
import string
# Remove every punctuation character from the rank token.
for char in string.punctuation:
    rank = rank.replace(char, '')
print(rank)
print(Category)
I have other URLs similar to this one and I want to loop through them. Here are the links. How can I loop through them and save the results to a CSV file? Thank you very much in advance!
# Product pages to loop over.
URL = [
    'https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
    'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2',
]
You could use a for-loop to iterate the list:
# Fetch and parse each product page in turn; per-page extraction belongs
# inside the loop body, because `soup` is rebound on every iteration.
for url in URL:
    # A timeout keeps one stalled request from hanging the whole run.
    webpage = requests.get(url, headers=headers, timeout=10)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(webpage.content, 'html.parser')
Note: Amazon does not want to be scraped, so it is only a matter of time before they block you. Consider adding a delay between requests, rotating proxies, ...
Example
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Browser-like headers sent with every product-page request.
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5'}
# Product pages to scrape.
URL = ['https://www.amazon.com/Dove-Intensive-Concentrate-Technology-Protects/dp/B0B1VVXTKL',
    'https://www.amazon.com/Dove-Intensive-Concentrate-Conditioner-Technology/dp/B0B1VXFLQ2']
# One dict per product page; becomes one CSV row each.
data = []
for url in URL:
    # A timeout keeps one stalled request from hanging the whole run.
    webpage = requests.get(url, headers=headers, timeout=10)
    # Stop on 4xx/5xx (e.g. a bot-block response) instead of failing later
    # with an opaque AttributeError on a page without the expected markup.
    webpage.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(webpage.content, 'html.parser')
    data.append({
        'url':url,
        # First token of the text after the "Best Seller" label, minus its
        # leading character.
        'rank':soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller")').contents[2].split()[0][1:],
        # Link text with everything up to 'Top 100 in ' removed.
        'category':soup.select_one('#detailBulletsWrapper_feature_div span:-soup-contains("Best Seller") a').text.split('Top 100 in ')[-1]
    })
pd.DataFrame(data).to_csv('myfile.csv', index=False)
Below are my Python code and its output. I want the output as rows and columns in a DataFrame:
# Download the page and grab every div carrying the card classes.
response = requests.get(source_data)
soup = BeautifulSoup(response.text, "html.parser")
States = soup.find_all('div', class_='card bg-darker p-3 mb-3')
for card in States:
    # Each card holds a name element and a percentage element.
    state_name = card.find(class_='fw-bold fs-5 mb-2').text
    vaccinated_per = card.find(class_='col-3 text-end fs-5 ff-s text-success').text
    print(f"{state_name} {vaccinated_per}")
Output:
Flanders 80.24%
Wallonia 70.00%
Brussels 56.73%
Ostbelgien 65.11%
Collect your information in a list of dicts and then simply create a data frame from it:
# One dict per card; pandas turns the list of dicts into rows and columns.
data = [
    {
        'state_name': item.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per': item.find(class_='col-3 text-end fs-5 ff-s text-success').text,
    }
    for item in States
]
pd.DataFrame(data)
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Browser-like header sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
# A timeout keeps a stalled connection from hanging the script.
response = requests.get('https://covid-vaccinatie.be/en', headers=headers, timeout=10)
# Stop on 4xx/5xx instead of parsing an error page into an empty table.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
States = soup.find_all('div',class_ = 'card bg-darker p-3 mb-3')
# One dict per card; each becomes one DataFrame row.
data = []
for item in States :
    data.append({
        'state_name' : item.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per' : item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    })
# A bare `pd.DataFrame(data)` expression shows nothing when run as a script,
# so keep the frame and print it explicitly.
df = pd.DataFrame(data)
print(df)
Output
state_name vaccinated_per
0 Flanders 80.24%
1 Wallonia 70.00%
2 Brussels 56.73%
3 Ostbelgien 65.11%
I'm trying to scrape this website
https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who=bygg&bf=1&page=1
I've added a tag parameter, def getQuestions(tag), to fill the who={tag} part of the URL, and that works fine. When I also add a page parameter, def getQuestions(tag, page), to fill page={page}, it just returns 0 in the terminal, and I really have no clue what could be causing this.
Here is the full code:
import requests
from bs4 import BeautifulSoup
# Browser-like header sent with every search request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
}
# Module-level accumulator that getQuestions appends to.
questionlist = []
def getQuestions(tag, page):
    """Scrape one merinfo.se result page and append its hits to ``questionlist``.

    NOTE(review): the URL below is a plain string, not an f-string, so
    ``{bygg}`` and ``{page}`` are sent literally and neither ``tag`` nor
    ``page`` influences the request.
    """
    url = 'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={bygg}&bf=1&page={page}'
    response = requests.get(url, headers=headers)
    parsed = BeautifulSoup(response.text, 'html.parser')
    for box in parsed.find_all('div', {'class': 'box-white p-0 mb-4'}):
        # The same anchor supplies both the title text and the link target.
        title_link = box.find('a', {'class': 'link-primary'})
        questionlist.append({
            'title': title_link.text,
            'link': title_link['href'],
            'nummer': box.find('a', {'class': 'link-body'})['href'],
            'address': box.find('address', {'class': 'mt-2 mb-0'}).text,
            'RegÅr': box.find('div', {'class': 'col text-center'}).text,
        })
# Request pages 1 through 4, then report how many hits were collected.
for x in range(1, 5):
    getQuestions(tag='bygg', page=x)
print(len(questionlist))
Any help would be appreciated. Best regards!
Change the string in url variable to f-string:
import requests
from bs4 import BeautifulSoup
# Browser-like header sent with every search request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/92.0.4515.107 Safari/537.36"
    ),
}
def getQuestions(tag, page):
    """Scrape one merinfo.se search-result page.

    Args:
        tag: Search term placed in the ``who`` query parameter.
        page: Result page number placed in the ``page`` query parameter.

    Returns:
        A list with one dict per result box, holding the keys ``title``,
        ``link``, ``nummer``, ``address`` and ``RegÅr``.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
    """
    questionlist = []
    # The f prefix is what makes {tag} and {page} interpolate.
    url = f"https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}"
    # A timeout keeps one stalled request from hanging the whole crawl.
    r = requests.get(url, headers=headers, timeout=10)
    # Stop on 4xx/5xx instead of silently returning an empty list.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    questions = soup.find_all("div", {"class": "box-white p-0 mb-4"})
    for item in questions:
        # The same anchor supplies both the title text and the link target,
        # so look it up once.
        title_link = item.find("a", {"class": "link-primary"})
        question = {
            "title": title_link.text,
            "link": title_link["href"],
            "nummer": item.find("a", {"class": "link-body"})["href"],
            "address": item.find("address", {"class": "mt-2 mb-0"}).text,
            "RegÅr": item.find("div", {"class": "col text-center"}).text,
        }
        questionlist.append(question)
    return questionlist
# Gather pages 1 through 4 into one flat list and report its size.
out = []
for x in range(1, 5):
    out += getQuestions("bygg", x)
print(len(out))
Prints:
80
Try changing your url to this:
# The f prefix is what makes {tag} and {page} interpolate into the URL.
url = f'https://www.merinfo.se/search?d=c&ap=1&emp=0%3A20&rev=0%3A100&who={tag}&bf=1&page={page}'
You didn't quite have your f-strings set up right.
I would like to create a web scraper that collects the specific holdings of an ETF. I found that Zacks.com provides a nice list of what I am looking for. I am trying to use BeautifulSoup; however, I am having a difficult time pinpointing the data in the "Symbol" column. What do I need to change or add to collect all the symbols as a list?
import requests
from bs4 import BeautifulSoup
tickers = ["XLU", "XLRE"]  # list of tickers whose financial data needs to be extracted
financial_dir = {}
for ticker in tickers:
    # getting holdings data from Zacks for the given ticker
    temp_dir = {}
    url = f'https://www.zacks.com/funds/etf/{ticker}/holding'
    page = requests.get(url)
    page_content = page.content
    soup = BeautifulSoup(page_content, 'html.parser')
    # Look the holdings table up by its id attribute.
    tabl = soup.find_all("table", id="etf_holding_table")
    for t in tabl:
        rows = t.find_all("button")
import requests
import re
# ETF tickers whose holdings pages are fetched.
keys = [
    'XLU',
    'XLRE',
]
# Browser-like header applied to the whole session.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) "
        "Gecko/20100101 Firefox/83.0"
    ),
}
def main(url):
    """Print and return the holdings symbols for every ticker in ``keys``.

    Args:
        url: URL template with one ``{}`` placeholder that is filled with
            each ticker via ``str.format``.

    Returns:
        A dict mapping each ticker to the list of symbols extracted from its
        page, so callers can use the symbols instead of only reading them
        off the console.

    Raises:
        requests.RequestException: A request failed, timed out, or the
            server answered with an HTTP error status.
    """
    # Matches the literal text `etf\/` and lazily captures everything up to
    # the next backslash. Compiled once, outside the loop.
    pattern = re.compile(r'etf\\\/(.*?)\\')
    holdings = {}
    with requests.Session() as req:
        req.headers.update(headers)
        for key in keys:
            # A timeout keeps one stalled request from hanging the run.
            r = req.get(url.format(key), timeout=10)
            # Stop on 4xx/5xx instead of reporting an empty symbol list.
            r.raise_for_status()
            print(f"Extracting: {r.url}")
            goal = pattern.findall(r.text)
            print(goal)
            holdings[key] = goal
    return holdings
# The `{}` placeholder is filled with each ticker by url.format(key) in main.
main("https://www.zacks.com/funds/etf/{}/holding")
Output:
Extracting: https://www.zacks.com/funds/etf/XLU/holding
['NEE', 'DUK', 'D', 'SO', 'AEP', 'XEL', 'EXC', 'SRE', 'WEC', 'ES', 'PEG', 'AWK', 'ED', 'DTE', 'PPL', 'ETR', 'AEE', 'EIX', 'CMS', 'FE', 'LNT', 'AES', 'ATO', 'EVRG', 'CNP', 'PNW', 'NI', 'NRG']
Extracting: https://www.zacks.com/funds/etf/XLRE/holding
['AMT', 'PLD', 'CCI', 'EQIX', 'DLR', 'PSA', 'SBAC', 'WELL', 'AVB', 'O', 'WY', 'SPG', 'ARE', 'EQR', 'VTR', 'CBRE', 'PEAK', 'EXR', 'DRE', 'MAA', 'ESS', 'BXP', 'UDR', 'HST', 'IRM', 'REG', 'VNO', 'AIV', 'FRT', 'KIM', 'SLG']
To answer Andrew Hick's comment, here is a version you can use to get weights, for etfs and mutual funds. You'll have to play with formatting a bit.
import requests
import re
# Tickers handled by main_etf.
etf_keys = [
    'XLU',
    'XLRE',
]
# Tickers handled by main_mutual.
mutual_fund_keys = [
    'VFTAX',
]
# Browser-like header applied to every session.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) "
        "Gecko/20100101 Firefox/83.0"
    ),
}
def main_etf(url):
    """Print and return symbols and detail strings for every ETF in ``etf_keys``.

    Args:
        url: URL template with one ``{}`` placeholder that is filled with
            each ticker via ``str.format``.

    Returns:
        A dict mapping each ticker to a ``(symbols, details)`` tuple holding
        the two lists that are also printed.

    Raises:
        requests.RequestException: A request failed, timed out, or the
            server answered with an HTTP error status.
    """
    # Both patterns are compiled once, outside the loop. The first captures
    # the text between `etf\/` and the next backslash; the second captures
    # the text between the closing span/anchor markup and the following
    # report link.
    symbol_pattern = re.compile(r'etf\\\/(.*?)\\')
    details_pattern = re.compile(r'<\\\/span><\\\/span><\\\/a>",(.*?), "<a class=\\\"report_document newwin\\')
    results = {}
    with requests.Session() as req:
        req.headers.update(headers)
        for key in etf_keys:
            # A timeout keeps one stalled request from hanging the run.
            r = req.get(url.format(key), timeout=10)
            # Stop on 4xx/5xx instead of reporting empty lists.
            r.raise_for_status()
            print(f"Extracting: {r.url}")
            etf_stock_list = symbol_pattern.findall(r.text)
            print(etf_stock_list)
            etf_stock_details_list = details_pattern.findall(r.text)
            print(etf_stock_details_list)
            results[key] = (etf_stock_list, etf_stock_details_list)
    return results
def main_mutual(url):
    """Print and return symbols and detail strings for every fund in ``mutual_fund_keys``.

    Args:
        url: URL template with one ``{}`` placeholder that is filled with
            each ticker via ``str.format``.

    Returns:
        A dict mapping each ticker to a ``(symbols, details)`` tuple holding
        the two lists that are also printed.

    Raises:
        requests.RequestException: A request failed, timed out, or the
            server answered with an HTTP error status.
    """
    # Both patterns are compiled once, outside the loop. The first captures
    # the text between `\/mutual-fund\/quote\/` and the next backslash; the
    # second captures the text between the closing span/anchor markup and
    # the next `%", "`.
    symbol_pattern = re.compile(r'\\\/mutual-fund\\\/quote\\\/(.*?)\\')
    details_pattern = re.compile(r'"sr-only\\\"><\\\/span><\\\/span><\\\/a>",(.*?)%", "')
    results = {}
    with requests.Session() as req:
        req.headers.update(headers)
        for key in mutual_fund_keys:
            # A timeout keeps one stalled request from hanging the run.
            r = req.get(url.format(key), timeout=10)
            # Stop on 4xx/5xx instead of reporting empty lists.
            r.raise_for_status()
            print(f"Extracting: {r.url}")
            mutual_stock_list = symbol_pattern.findall(r.text)
            print(mutual_stock_list)
            mutual_stock_details_list = details_pattern.findall(r.text)
            print(mutual_stock_details_list)
            results[key] = (mutual_stock_list, mutual_stock_details_list)
    return results
# Each `{}` placeholder is filled with a ticker by url.format(key) inside
# the called function.
main_etf("https://www.zacks.com/funds/etf/{}/holding")
main_mutual("https://www.zacks.com/funds/mutual-fund/quote/{}/holding")
I have this url from coronavirus worldwide and I would like to pick only one number, the newcases in Arizona which is +2383 right now.
import requests
from bs4 import BeautifulSoup
import lxml
url = "https://www.worldmeter.com/coronavirus/us/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "lxml")
page.close()
# Look for the Arizona link; when nothing matches, find() returns None and
# the get_text call below raises AttributeError.
arizona_link = soup.find(
    'a',
    href_="https://worldmeter.com/coronavirus/arizona",
    class_="tableRowLinkYellow newCasesStates",
)
newcases = arizona_link.get_text(strip=True)
print(newcases)
I get this error:
AttributeError: 'NoneType' object has no attribute 'get_text'
How do I pick only that number from the whole table? Thank you for your time.
Just like Linh said, the data is generated by JavaScript. Using Selenium is an easy way to get it, but it is not efficient enough (too slow).
You could scrape the API directly:
import requests
# JSON endpoint that backs the table shown on the page.
url = "https://worldmeter.com/coronavirus/wp-admin/admin-ajax.php?action=wp_ajax_ninja_tables_public_action&table_id=2582&target_action=get-all-data&default_sorting=old_first"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
}
# A timeout keeps a stalled connection from hanging the script.
response = requests.get(url, headers=headers, timeout=10)
# Stop on 4xx/5xx before trying to decode an error page as JSON.
response.raise_for_status()
results = response.json()
# Each entry is a dict of table columns; pick the Arizona row.
for result in results:
    if result["state_name"] == "Arizona":
        print(result)
        print("The newcases is", result["new_cases"])
And this gave me:
{'state_name': 'Arizona', 'positive': '275,436', 'new_cases': '2,383', 'death_in_states': '6,302', 'new_deaths': '2', 'recovered_states': '45,400', 'new_recovered': '364', 'totaltestresults': 'Arizona', 'postname': 'arizona', 'cases_100_k_population': '3,866.37', 'state_population': '7278717', 'death_100_k_population': '88.46'}
The newcases is 2,383