CSV file and looping over a list of URLs in Python

I've been trying to loop over a CSV file containing a list of URLs (stock market tickers) with this code, to scrape and store data in Excel. With one URL I could do it, but I can't seem to find a way to do it with a list of URLs. This is my code:
import requests
import json
import csv
import pandas as pd

Urls = open('AcoesURLJsonCompleta.csv')
for row in Urls:
    obj_id = row.strip().split(',')
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    jsonData = requests.get(row, headers=headers).json()
    data = {
        'Ticker': [],
        'Beta': [],
        'DY': [],
        'VOL': [],
        'P/L': [],
        'Cresc5A': [],
        'LPA': [],
        'VPA': [],
        'Ultimo': []
    }
    ticker = jsonData['ric']
    beta = jsonData['beta']
    DY = jsonData['current_dividend_yield_ttm']
    VOL = jsonData['share_volume_3m']
    PL = jsonData['pe_normalized_annual']
    cresc5a = jsonData['eps_growth_5y']
    LPA = jsonData['eps_normalized_annual']
    VPA = jsonData['book_value_share_quarterly']
    Ultimo = jsonData['last']
    data['Ticker'].append(ticker)
    data['Beta'].append(beta)
    data['DY'].append(DY)
    data['VOL'].append(VOL)
    data['P/L'].append(PL)
    data['Cresc5A'].append(cresc5a)
    data['LPA'].append(LPA)
    data['VPA'].append(VPA)
    data['Ultimo'].append(Ultimo)
    table = pd.DataFrame(data, columns=['Ticker', 'Beta', 'DY', 'VOL', 'P/L', 'Cresc5A', 'LPA', 'VPA', 'Ultimo'])
    table.index = table.index + 1
    table.to_csv('CompleteData.csv', sep=',', encoding='utf-8', index=False)
    print(table)
The output is always a KeyError on one of those jsonData fields, for example KeyError: 'beta'. How do I fix this?

Assuming your URLs are valid and you don't have other validation errors (like KeyError), you need to loop through all of them and build a DataFrame for each. Then append each DataFrame to the CSV file, with a structure such as:
for row in Urls:
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
    jsonData = requests.get(row, headers=headers).json()  # row may need .strip() to drop the trailing newline
    data = {
        'Ticker': [],
        'Beta': [],
        'DY': [],
        'VOL': [],
        'P/L': [],
        'Cresc5A': [],
        'LPA': [],
        'VPA': [],
        'Ultimo': []
    }
    ticker = jsonData['ric']
    beta = jsonData['beta']
    DY = jsonData['current_dividend_yield_ttm']
    VOL = jsonData['share_volume_3m']
    PL = jsonData['pe_normalized_annual']
    cresc5a = jsonData['eps_growth_5y']
    LPA = jsonData['eps_normalized_annual']
    VPA = jsonData['book_value_share_quarterly']
    Ultimo = jsonData['last']
    data['Ticker'].append(ticker)
    data['Beta'].append(beta)
    data['DY'].append(DY)
    data['VOL'].append(VOL)
    data['P/L'].append(PL)
    data['Cresc5A'].append(cresc5a)
    data['LPA'].append(LPA)
    data['VPA'].append(VPA)
    data['Ultimo'].append(Ultimo)
    table = pd.DataFrame(data, columns=['Ticker', 'Beta', 'DY', 'VOL', 'P/L', 'Cresc5A', 'LPA', 'VPA', 'Ultimo'])
    with open("append_to_csv.csv", 'a') as f:
        # write the header only once, while the file is still empty (f.tell() == 0)
        table.to_csv(f, mode='a', header=not f.tell(), index=False)
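If the KeyError comes from responses that are missing some of those fields, one defensive option is dict.get() with a default, so a missing key yields None instead of raising. This is only a sketch; the JSON field names are taken from the question and may not be present in every response:

fields = {
    'Ticker': 'ric',
    'Beta': 'beta',
    'DY': 'current_dividend_yield_ttm',
    'VOL': 'share_volume_3m',
    'P/L': 'pe_normalized_annual',
    'Cresc5A': 'eps_growth_5y',
    'LPA': 'eps_normalized_annual',
    'VPA': 'book_value_share_quarterly',
    'Ultimo': 'last',
}
# Map each output column to its JSON key; .get() returns None for
# keys the API did not include, instead of raising KeyError.
row_data = {col: jsonData.get(key) for col, key in fields.items()}
table = pd.DataFrame([row_data])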

Seems to me you're using beta instead of Beta. Just fix the capital letter.

Related

Can you scrape multiple stock prices on a recurring schedule?

With the current code, I can scrape multiple prices, but it doesn't automatically re-scrape them every 2 minutes, which is what I need.
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

mystocks = ['GOOG', 'META', 'MSFT', 'PLTR', 'TSLA', 'ZS', 'PYPL', 'SHOP', 'TTCF']
stockdata = []

def getData(symbol):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    url = f'https://finance.yahoo.com/quote/{symbol}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    stock = {
        'symbol': symbol,
        'price': soup.find('div', {'class': 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
    }
    return stock

for item in mystocks:
    stockdata.append(getData(item))

def export_data(stockdata):
    df = pd.DataFrame(stockdata)
    df.to_excel("LETS GO2.xlsx")

if __name__ == '__main__':
    while True:
        getData(item)
        export_data(stockdata)
        time_wait = 2
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)
Your for-loop is in the wrong place.
Put it inside your while True: block to loop over every ticker every two minutes.
EDIT:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

mystocks = ['GOOG', 'META', 'MSFT', 'PLTR', 'TSLA', 'ZS', 'PYPL', 'SHOP', 'TTCF']

def getData(symbol):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    url = f'https://finance.yahoo.com/quote/{symbol}'
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    stock = {
        'symbol': symbol,
        'price': soup.find('div', {'class': 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
    }
    return stock

def export_data(stockdata):
    df = pd.DataFrame(stockdata)
    df.to_excel("LETS GO2.xlsx")

if __name__ == "__main__":
    while True:
        stockdata = []
        # scrape every ticker on each pass, then export the batch
        for item in mystocks:
            print(item)
            stockdata.append(getData(item))
        export_data(stockdata)
        time_wait = 0.1  # set to 2 for a two-minute interval
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)
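Note that df.to_excel here overwrites the same file on every pass. If you would rather keep a history of the two-minute snapshots, one variation (not part of the original answer, just a sketch) is to timestamp each batch and append to a CSV:

import os
from datetime import datetime

def export_data(stockdata):
    df = pd.DataFrame(stockdata)
    # Tag each snapshot so successive scrapes can be told apart.
    df['scraped_at'] = datetime.now().isoformat(timespec='seconds')
    # Append to the file, writing the header only on the first run.
    df.to_csv('prices_history.csv', mode='a', index=False,
              header=not os.path.exists('prices_history.csv'))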

How to change Json data output in table format

import requests
from pprint import pprint
import pandas as pd

baseurl = "https://www.nseindia.com/"
url = 'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
           'accept-language': 'en,gu;q=0.9,hi;q=0.8',
           'accept-encoding': 'gzip, deflate, br'}

session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
print(res.json())
I tried df = pd.DataFrame(res.json()) but couldn't get the data in table format. How can I do that? Also, how do I select only a few particular columns in the output instead of all of them?
Try this:
import json
import codecs
df = pd.DataFrame(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
And to select specific columns, you can use:
mini_df = df[['symbol', 'latestOI', 'prevOI', 'changeInOI', 'avgInOI']]
>>> print(mini_df)

Create Rows and Columns in BeautifulSoup

Below is my Python code and its output. I want the output as rows and columns in a DataFrame:
response = requests.get(source_data)
soup = BeautifulSoup(response.text, "html.parser")
States = soup.find_all('div', class_='card bg-darker p-3 mb-3')
for item in States:
    state_name = item.find(class_='fw-bold fs-5 mb-2').text
    vaccinated_per = item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    print(state_name, vaccinated_per)
Output:
Flanders 80.24%
Wallonia 70.00%
Brussels 56.73%
Ostbelgien 65.11%
Collect your information in a list of dicts and then simply create a data frame from it:
data = []
for item in States:
    data.append({
        'state_name': item.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per': item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    })
pd.DataFrame(data)
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
response = requests.get('https://covid-vaccinatie.be/en', headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
States = soup.find_all('div', class_='card bg-darker p-3 mb-3')

data = []
for item in States:
    data.append({
        'state_name': item.find(class_='fw-bold fs-5 mb-2').text,
        'vaccinated_per': item.find(class_='col-3 text-end fs-5 ff-s text-success').text
    })
pd.DataFrame(data)
Output
state_name vaccinated_per
0 Flanders 80.24%
1 Wallonia 70.00%
2 Brussels 56.73%
3 Ostbelgien 65.11%
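If you then want the percentages as numbers rather than strings, a small follow-up step (a sketch, assuming every value has the form '80.24%') could be:

df = pd.DataFrame(data)
# Strip the trailing '%' and convert the column to float so it can
# be sorted or aggregated numerically.
df['vaccinated_per'] = df['vaccinated_per'].str.rstrip('%').astype(float)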

Yandex Spellchecker API Returns Empty Array

I am trying to harness a Russian language spellcheck API, Yandex.Speller.
The request seems to work fine in my browser. However, when I use a Python script, the response is empty.
I am stumped as to what I am doing wrong.
Here is my code:
import urllib
from urllib.request import urlopen
import json

def main():
    api(text_preproc())

def text_preproc():
    """ Takes misspelled word/phrase,
    “t”, and prepares it for
    API request
    """
    t = "синхрафазатрон в дубне"
    text = t.replace(" ", "+")
    return text

def diff_api(text):
    my_url = "https://speller.yandex.net/services/spellservice.json/checkText?text="
    my_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
    my_data = {
        "text": text,
        "lang": "ru",
        "format": "plain"}
    my_uedata = urllib.parse.urlencode(my_data)
    my_edata = my_uedata.encode('ascii')
    req = urllib.request.Request(url=my_url, data=my_edata, headers=my_headers)
    response = urlopen(req)
    data = json.load(response)
    print(data)
The response is always an empty array, no matter how I tinker with my request.
Any insight into what I might be doing wrong?
my_uedata has to be part of the URL you send the request to.
Also, in:
def main():
    api(text_preproc())
you call api(), but that function is not defined. I've used diff_api().
Try this:
import json
import urllib
from urllib.request import urlopen

def main():
    diff_api(text_preproc("синхрафазатрон в дубне"))

def text_preproc(phrase):
    """ Takes misspelled word/phrase,
    “t”, and prepares it for
    API request
    """
    return phrase.replace(" ", "+")

def diff_api(text):
    my_url = "https://speller.yandex.net/services/spellservice.json/checkText?text="
    my_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
    my_data = {
        "text": text,
        "lang": "ru",
        "format": "plain"}
    my_uedata = urllib.parse.urlencode(my_data)
    req = urllib.request.Request(url=my_url + my_uedata, headers=my_headers)
    data = json.load(urlopen(req))
    print(data)

main()
Output:
[{'code': 1, 'pos': 5, 'row': 0, 'col': 5, 'len': 14, 'word': 'синхрафазатрон', 's': ['синхрофазотрон', 'синхрофазатрон', 'синхрофазотрона']}]
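For what it's worth, the same GET request is shorter with the requests library, which builds and URL-encodes the query string for you via params (a sketch, assuming requests is installed; the helper name check_text is mine):

import requests

def check_text(text):
    # The Speller endpoint takes its arguments as GET query parameters;
    # requests encodes and appends them to the URL.
    url = "https://speller.yandex.net/services/spellservice.json/checkText"
    resp = requests.get(url, params={"text": text, "lang": "ru", "format": "plain"})
    return resp.json()

print(check_text("синхрафазатрон в дубне"))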

Pandas include Key to json file

import requests
import pandas as pd
import json
url = 'http://www.fundamentus.com.br/resultado.php'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
fundamentus = requests.get(url, headers=headers)
dfs = pd.read_html(fundamentus.text)
table = dfs[0]
table.to_json('table7.json', orient='records', indent=2)
This is giving me the following:
[{
"Papel":"VNET3",
"Cota\u00e7\u00e3o":0.0,
"P\/L":0.0,
"P\/VP":0.0,
"PSR":0.0,
"Div.Yield":"0,00%",
"P\/Ativo":0.0,
"P\/Cap.Giro":0,
"P\/EBIT":0.0,
"P\/Ativ Circ.Liq":0,
"EV\/EBIT":0.0,
"EV\/EBITDA":0.0,
"Mrg Ebit":"0,00%",
"Mrg. L\u00edq.":"0,00%",
"Liq. Corr.":0,
"ROIC":"0,00%",
"ROE":"12,99%",
"Liq.2meses":"000",
"Patrim. L\u00edq":"9.257.250.00000",
"D\u00edv.Brut\/ Patrim.":0.0,
"Cresc. Rec.5a":"-2,71%"
},
{
"Papel":"CFLU4",
"Cota\u00e7\u00e3o":1.0,
"P\/L":0.0,
"P\/VP":0.0,
"PSR":0.0,
"Div.Yield":"0,00%",
"P\/Ativo":0.0,
"P\/Cap.Giro":0,
"P\/EBIT":0.0,
"P\/Ativ Circ.Liq":0,
"EV\/EBIT":0.0,
"EV\/EBITDA":0.0,
"Mrg Ebit":"8,88%",
"Mrg. L\u00edq.":"10,72%",
"Liq. Corr.":110,
"ROIC":"17,68%",
"ROE":"32,15%",
"Liq.2meses":"000",
"Patrim. L\u00edq":"60.351.00000",
"D\u00edv.Brut\/ Patrim.":6.0,
"Cresc. Rec.5a":"8,14%"
}
]
But I need the following:
[ VNET3 = {
"Cota\u00e7\u00e3o":0.0,
"P\/L":0.0,
"P\/VP":0.0,
"PSR":0.0,
"Div.Yield":"0,00%",
"P\/Ativo":0.0,
"P\/Cap.Giro":0,
"P\/EBIT":0.0,
"P\/Ativ Circ.Liq":0,
"EV\/EBIT":0.0,
"EV\/EBITDA":0.0,
"Mrg Ebit":"0,00%",
"Mrg. L\u00edq.":"0,00%",
"Liq. Corr.":0,
"ROIC":"0,00%",
"ROE":"12,99%",
"Liq.2meses":"000",
"Patrim. L\u00edq":"9.257.250.00000",
"D\u00edv.Brut\/ Patrim.":0.0,
"Cresc. Rec.5a":"-2,71%"
},
CFLU4 = {
"Cota\u00e7\u00e3o":1.0,
"P\/L":0.0,
"P\/VP":0.0,
"PSR":0.0,
"Div.Yield":"0,00%",
"P\/Ativo":0.0,
"P\/Cap.Giro":0,
"P\/EBIT":0.0,
"P\/Ativ Circ.Liq":0,
"EV\/EBIT":0.0,
"EV\/EBITDA":0.0,
"Mrg Ebit":"8,88%",
"Mrg. L\u00edq.":"10,72%",
"Liq. Corr.":110,
"ROIC":"17,68%",
"ROE":"32,15%",
"Liq.2meses":"000",
"Patrim. L\u00edq":"60.351.00000",
"D\u00edv.Brut\/ Patrim.":6.0,
"Cresc. Rec.5a":"8,14%"
}
]
The encoding is coming out wrong as well, for example "Cota\u00e7\u00e3o".
I tried:
table.to_json('table7.json', force_ascii=True, orient='records', indent=2)
I also tried:
table.to_json('table7.json', encoding='utf8', orient='records', indent=2)
But no success. So I tried to read it with json, because the idea was to read it and convert it. This is the JSON reader statement:
jasonfile = open('table7.json', 'r')
stocks = jasonfile.read()
jason_object = json.loads(stocks)
print(str(jason_object['Papel']))
But I got:
print(str(jason_object['Papel']))
TypeError: list indices must be integers or slices, not str
Thanks in advance.
You have a list with many dictionaries, so you have to use an index like [0] to get one dictionary:
print(jason_object[0]['Papel'])
And the text Cota\u00e7\u00e3o can be correct; it is how JSON stores non-ASCII characters. If you print it:
print('Cota\u00e7\u00e3o')
then you should get
Cotação
When I run:
for key in jason_object[0].keys():
    print(key)
then I get on the screen:
Papel
Cotação
P/L
P/VP
PSR
Div.Yield
P/Ativo
P/Cap.Giro
P/EBIT
P/Ativ Circ.Liq
EV/EBIT
EV/EBITDA
Mrg Ebit
Mrg. Líq.
Liq. Corr.
ROIC
ROE
Liq.2meses
Patrim. Líq
Dív.Brut/ Patrim.
Cresc. Rec.5a
But if I open table7.json in a text editor, then I see Cota\u00e7\u00e3o.
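If you want the file itself to contain readable UTF-8 instead of \uXXXX escapes, DataFrame.to_json has a force_ascii flag (default True); setting it to False writes the native characters:

table.to_json('table7.json', orient='records', indent=2, force_ascii=False)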
A list like [ VNET3 = { .. } ] is not correct JSON, nor a valid Python structure. The correct JSON and Python structure is a dictionary: { "VNET3": { .. } }
new_data = dict()
for item in jason_object:
    key = item['Papel']
    item.pop('Papel')
    val = item
    new_data[key] = val
print(new_data)
Minimal working code
import requests
import pandas as pd
import json

url = 'http://www.fundamentus.com.br/resultado.php'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}

response = requests.get(url, headers=headers)
dfs = pd.read_html(response.text)
table = dfs[0]
table.to_json('table7.json', orient='records', indent=2)

jasonfile = open('table7.json', 'r')
jason_object = json.loads(jasonfile.read())

#print(jason_object[0]['Papel'])

#for key in jason_object[0].keys():
#    print(key)

new_data = dict()
for item in jason_object:
    key = item['Papel']
    item.pop('Papel')
    val = item
    new_data[key] = val

print(new_data)
Tested on Python 3.7, Linux Mint, which uses UTF-8 in the console/terminal by default.
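As a side note, the same ticker-keyed dictionary can be built directly from the DataFrame, without the intermediate JSON file (a sketch, assuming the 'Papel' values are unique):

# set_index('Papel') makes the ticker the index, and
# to_dict(orient='index') maps each index value to its row as a dict.
new_data = table.set_index('Papel').to_dict(orient='index')

import json
with open('table7.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps characters like 'ç' readable in the file
    json.dump(new_data, f, ensure_ascii=False, indent=2)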
