CSV file and loop list of URL in Python

CSV file and loop list of URL in Python - python

I've been trying to loop a CSV file, a list of URL, with this code, to scrape and store data in Excel. With one URL I could do it, but cant seem to find a way to do that with a list of URL (stock market tickers). This is my code:
import requests
import json
import csv
import pandas as pd
Urls = open('AcoesURLJsonCompleta.csv')
for row in Urls:
obj_id = row.strip().split(',')
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
jsonData = requests.get(row, headers=headers).json()
data = {
'Ticker': [],
'Beta': [],
'DY': [],
'VOL': [],
'P/L': [],
'Cresc5A': [],
'LPA': [],
'VPA': [],
'Ultimo': []
}
ticker = jsonData['ric']
beta = jsonData['beta']
DY = jsonData['current_dividend_yield_ttm']
VOL = jsonData['share_volume_3m']
PL = jsonData['pe_normalized_annual']
cresc5a = jsonData['eps_growth_5y']
LPA = jsonData['eps_normalized_annual']
VPA = jsonData['book_value_share_quarterly']
Ultimo = jsonData['last']
data['Ticker'].append(ticker)
data['Beta'].append(beta)
data['DY'].append(DY)
data['VOL'].append(VOL)
data['P/L'].append(PL)
data['Cresc5A'].append(cresc5a)
data['LPA'].append(LPA)
data['VPA'].append(VPA)
data['Ultimo'].append(Ultimo)
table = pd.DataFrame(data, columns=['Ticker', 'Beta', 'DY', 'VOL', 'P/L', 'Cresc5A', 'LPA', 'VPA', 'Ultimo'])
table.index = table.index + 1
table.to_csv('CompleteData.csv', sep=',', encoding='utf-8', index=False)
print(table)
The output is always a KeyError:with those jsonData, as KeyError: 'beta' for example. How to fix this?

Assuming your urls are valid and you don't have other validation errors (like KeyError), you need to loop through all of them and build a dataframe for each. Then append the dataframe to the csv file, with a structure such as:
for row in Urls:
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
jsonData = requests.get(row, headers=headers).json()
data = {
'Ticker': [],
'Beta': [],
'DY': [],
'VOL': [],
'P/L': [],
'Cresc5A': [],
'LPA': [],
'VPA': [],
'Ultimo': []
}
ticker = jsonData['ric']
beta = jsonData['beta']
DY = jsonData['current_dividend_yield_ttm']
VOL = jsonData['share_volume_3m']
PL = jsonData['pe_normalized_annual']
cresc5a = jsonData['eps_growth_5y']
LPA = jsonData['eps_normalized_annual']
VPA = jsonData['book_value_share_quarterly']
Ultimo = jsonData['last']
data['Ticker'].append(ticker)
data['Beta'].append(beta)
data['DY'].append(DY)
data['VOL'].append(VOL)
data['P/L'].append(PL)
data['Cresc5A'].append(cresc5a)
data['LPA'].append(LPA)
data['VPA'].append(VPA)
data['Ultimo'].append(Ultimo)
table = pd.DataFrame(data, columns=['Ticker', 'Beta', 'DY', 'VOL', 'P/L', 'Cresc5A', 'LPA', 'VPA', 'Ultimo'])
with open("append_to_csv.csv", 'a') as f:
table.to_csv(f, mode='a', header=not f.tell(), index=False)

Seems to me you're using beta instead of Beta. Just fix the capital letter.

Related

Can you scrape multiple stock prices on a recurring schedule?

With the current code, I can scrape multiple prices, but it doesn't automatically re-scrape them every 2 minutes which is what I need.
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
mystocks = ['GOOG', 'META', 'MSFT', 'PLTR', 'TSLA', 'ZS', 'PYPL', 'SHOP', 'TTCF']
stockdata = []
def getData(symbol):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
url = f'https://finance.yahoo.com/quote/{symbol}'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
stock = {
'symbol': symbol,
'price': soup.find('div', {'class':'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
}
return stock
for item in mystocks:
stockdata.append(getData(item))
def export_data(stockdata):
df = pd.DataFrame(stockdata)
df.to_excel("LETS GO2.xlsx")
if __name__ == '__main__':
while True:
getData(item)
export_data(stockdata)
time_wait = 2
print(f'Waiting {time_wait} minutes...')
time.sleep(time_wait * 60)

Your for-loop is at the wrong place.
Try to put it in your while True: block to loop over every ticker every two minutes.
EDIT:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
mystocks = ['GOOG', 'META', 'MSFT', 'PLTR', 'TSLA', 'ZS', 'PYPL', 'SHOP', 'TTCF']
def getData(symbol):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
url = f'https://finance.yahoo.com/quote/{symbol}'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
stock = {
'symbol': symbol,
'price': soup.find('div', {'class': 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text,
}
return stock
def export_data(stockdata):
df = pd.DataFrame(stockdata)
df.to_excel("LETS GO2.xlsx")
if __name__ == "__main__":
while True:
stockdata = []
for item in mystocks:
print(item)
stockdata.append(getData(item))
export_data(stockdata)
time_wait = 0.1
print(f'Waiting {time_wait} minutes...')
time.sleep(time_wait * 60)

How to change Json data output in table format

import requests
from pprint import pprint
import pandas as pd
baseurl = "https://www.nseindia.com/"
url = f'https://www.nseindia.com/api/live-analysis-oi-spurts-underlyings'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
'like Gecko) '
'Chrome/80.0.3987.149 Safari/537.36',
'accept-language': 'en,gu;q=0.9,hi;q=0.8', 'accept-encoding': 'gzip, deflate, br'}
session = requests.Session()
request = session.get(baseurl, headers=headers, timeout=30)
cookies = dict(request.cookies)
res = session.get(url, headers=headers, timeout=30, cookies=cookies)
print(res.json())
I tried df = pd.DataFrame(res.json()) but couldn't get data in table format. How to do that Plz. Also how to select few particular columns only in data output instead of all columns.

Try this :
import json
import codecs
df = pd.DataFrame(json.loads(codecs.decode(bytes(res.text, 'utf-8'), 'utf-8-sig'))['data'])
And to select a specific columns, you can use :
mini_df = df[['symbol', 'latestOI', 'prevOI', 'changeInOI', 'avgInOI']]
>>> print(mini_df)

Create Rows and Columns in BeautifulSoup

Below is code python code output.I want output in rows and column in dataframe:
response = requests.get(source_data)
soup = BeautifulSoup(response.text, "html.parser")
States = soup.find_all('div',class_ = 'card bg-darker p-3 mb-3')
for item in States :
state_name = item.find(class_='fw-bold fs-5 mb-2').text
vaccinated_per = item.find(class_='col-3 text-end fs-5 ff-s text-success').text
print(state_name,vaccinated_per)
Output:
Flanders 80.24%
Wallonia 70.00%
Brussels 56.73%
Ostbelgien 65.11%

Collect your information in a list of dicts and then simply create a data frame from it:
data = []
for item in States :
data.append({
'state_name' : item.find(class_='fw-bold fs-5 mb-2').text,
'vaccinated_per' : item.find(class_='col-3 text-end fs-5 ff-s text-success').text
})
pd.DataFrame(data)
Example
from bs4 import BeautifulSoup
import requests
import pandas as pd
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
response = requests.get('https://covid-vaccinatie.be/en', headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
States = soup.find_all('div',class_ = 'card bg-darker p-3 mb-3')
data = []
for item in States :
data.append({
'state_name' : item.find(class_='fw-bold fs-5 mb-2').text,
'vaccinated_per' : item.find(class_='col-3 text-end fs-5 ff-s text-success').text
})
pd.DataFrame(data)
Output
state_name vaccinated_per
0 Flanders 80.24%
1 Wallonia 70.00%
2 Brussels 56.73%
3 Ostbelgien 65.11%

Yandex Spellchecker API Returns Empty Array

I am trying to harness a Russian language spellcheck API, Yandex.Speller.
The request seems to work fine in my browser. However, when I use a python script, the response is empty.
I am stumped as to what I am doing wrong.
Here is my code:
import urllib
from urllib.request import urlopen
import json
def main():
api(text_preproc())
def text_preproc():
""" Takes misspelled word/phrase,
“t”, and prepares it for
API request
"""
t = "синхрафазатрон в дубне"
text = t.replace(" ", "+")
return text
def diff_api(text):
my_url = "https://speller.yandex.net/services/spellservice.json/checkText?text="
my_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
my_data = {
"text" : text,
"lang" : "ru",
"format" : "plain"}
my_uedata = urllib.parse.urlencode(my_data)
my_edata = my_uedata.encode('ascii')
req = urllib.request.Request(url=my_url, data=my_edata, headers=my_headers)
response = urlopen(req)
data = json.load(response)
print(data)
The response is always an empty array, no matter how I tinker with my request.
Any insight into what I might be doing wrong?

my_uedata has to be a part of the URL you send the request to.
Also, in:
def main():
api(text_preproc())
You call api() but the function is not defined. I've used diff_api().
Try this:
import json
import urllib
from urllib.request import urlopen
def main():
diff_api(text_preproc("синхрафазатрон в дубне"))
def text_preproc(phrase):
""" Takes misspelled word/phrase,
“t”, and prepares it for
API request
"""
return phrase.replace(" ", "+")
def diff_api(text):
my_url = "https://speller.yandex.net/services/spellservice.json/checkText?text="
my_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
my_data = {
"text": text,
"lang": "ru",
"format": "plain"}
my_uedata = urllib.parse.urlencode(my_data)
req = urllib.request.Request(url=my_url+my_uedata, headers=my_headers)
data = json.load(urlopen(req))
print(data)
main()
Output:
[{'code': 1, 'pos': 5, 'row': 0, 'col': 5, 'len': 14, 'word': 'синхрафазатрон', 's': ['синхрофазотрон', 'синхрофазатрон', 'синхрофазотрона']}]

Pandas include Key to json file

import requests
import pandas as pd
import json
url = 'http://www.fundamentus.com.br/resultado.php'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
fundamentus = requests.get(url, headers=headers)
dfs = pd.read_html(fundamentus.text)
table = dfs[0]
table.to_json('table7.json', orient='records', indent=2)
this is giving me the following:
[{
"Papel":"VNET3",
"Cota\u00e7\u00e3o":0.0,
"P\/L":0.0,
"P\/VP":0.0,
"PSR":0.0,
"Div.Yield":"0,00%",
"P\/Ativo":0.0,
"P\/Cap.Giro":0,
"P\/EBIT":0.0,
"P\/Ativ Circ.Liq":0,
"EV\/EBIT":0.0,
"EV\/EBITDA":0.0,
"Mrg Ebit":"0,00%",
"Mrg. L\u00edq.":"0,00%",
"Liq. Corr.":0,
"ROIC":"0,00%",
"ROE":"12,99%",
"Liq.2meses":"000",
"Patrim. L\u00edq":"9.257.250.00000",
"D\u00edv.Brut\/ Patrim.":0.0,
"Cresc. Rec.5a":"-2,71%"
},
{
"Papel":"CFLU4",
"Cota\u00e7\u00e3o":1.0,
"P\/L":0.0,
"P\/VP":0.0,
"PSR":0.0,
"Div.Yield":"0,00%",
"P\/Ativo":0.0,
"P\/Cap.Giro":0,
"P\/EBIT":0.0,
"P\/Ativ Circ.Liq":0,
"EV\/EBIT":0.0,
"EV\/EBITDA":0.0,
"Mrg Ebit":"8,88%",
"Mrg. L\u00edq.":"10,72%",
"Liq. Corr.":110,
"ROIC":"17,68%",
"ROE":"32,15%",
"Liq.2meses":"000",
"Patrim. L\u00edq":"60.351.00000",
"D\u00edv.Brut\/ Patrim.":6.0,
"Cresc. Rec.5a":"8,14%"
}
]
But I need the following.
[ VNET3 = {
"Cota\u00e7\u00e3o":0.0,
"P\/L":0.0,
"P\/VP":0.0,
"PSR":0.0,
"Div.Yield":"0,00%",
"P\/Ativo":0.0,
"P\/Cap.Giro":0,
"P\/EBIT":0.0,
"P\/Ativ Circ.Liq":0,
"EV\/EBIT":0.0,
"EV\/EBITDA":0.0,
"Mrg Ebit":"0,00%",
"Mrg. L\u00edq.":"0,00%",
"Liq. Corr.":0,
"ROIC":"0,00%",
"ROE":"12,99%",
"Liq.2meses":"000",
"Patrim. L\u00edq":"9.257.250.00000",
"D\u00edv.Brut\/ Patrim.":0.0,
"Cresc. Rec.5a":"-2,71%"
},
CFLU4 = {
"Cota\u00e7\u00e3o":1.0,
"P\/L":0.0,
"P\/VP":0.0,
"PSR":0.0,
"Div.Yield":"0,00%",
"P\/Ativo":0.0,
"P\/Cap.Giro":0,
"P\/EBIT":0.0,
"P\/Ativ Circ.Liq":0,
"EV\/EBIT":0.0,
"EV\/EBITDA":0.0,
"Mrg Ebit":"8,88%",
"Mrg. L\u00edq.":"10,72%",
"Liq. Corr.":110,
"ROIC":"17,68%",
"ROE":"32,15%",
"Liq.2meses":"000",
"Patrim. L\u00edq":"60.351.00000",
"D\u00edv.Brut\/ Patrim.":6.0,
"Cresc. Rec.5a":"8,14%"
}
]
The enconding is comming wrongly as well.
For example: "Cota\u00e7\u00e3o"
I tried: table.to_json('table7.json',**force_ascii=True**, orient='records', indent=2)
I also tried.
table.to_json('table7.json',**encoding='utf8'**, orient='records', indent=2)
But no success.
So I tried to read with json because the Idea was read it and convert it.
This is the json reader statement.
jasonfile = open('table7.json', 'r')
stocks = jasonfile.read()
jason_object = json.loads(stocks)
print(str(jason_object['Papel']))
But I've got.
**print(str(jason_object['Papel']))
TypeError: list indices must be integers or slices, not str**
Thanks in advance.

You have list with many dictionaries so you have to use index like [0] to get one dictionary
print( jason_object[0]['Papel'] )
And text Cota\u00e7\u00e3o can be correct. It is how JSON keeps native chars.
But if you print it
print('Cota\u00e7\u00e3o')
then you should get
Cotação
When I run
for key in jason_object[0].keys():
print(key)
then I get on screen
VNET3
Papel
Cotação
P/L
P/VP
PSR
Div.Yield
P/Ativo
P/Cap.Giro
P/EBIT
P/Ativ Circ.Liq
EV/EBIT
EV/EBITDA
Mrg Ebit
Mrg. Líq.
Liq. Corr.
ROIC
ROE
Liq.2meses
Patrim. Líq
Dív.Brut/ Patrim.
Cresc. Rec.5a
But if I open table7.json in text editor then I see Cota\u00e7\u00e3o
List [ VNET3 = { .. }] it is not correct JSON nor Python structure.
Correct JSON and Python structure is dictionary { "VNET3": { .. } }
new_data = dict()
for item in jason_object:
key = item['Papel']
item.pop('Papel')
val = item
new_data[key] = val
print(new_data)
Minimal working code
import requests
import pandas as pd
import json
url = 'http://www.fundamentus.com.br/resultado.php'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
response = requests.get(url, headers=headers)
dfs = pd.read_html(response.text)
table = dfs[0]
table.to_json('table7.json', orient='records', indent=2)
jasonfile = open('table7.json', 'r')
jason_object = json.loads(jasonfile.read())
#print(jason_object[0]['Papel'])
#for key in jason_object[0].keys():
# print(key)
new_data = dict()
for item in jason_object:
key = item['Papel']
item.pop('Papel')
val = item
new_data[key] = val
print(new_data)
Tested on Python 3.7, Linux Mint which as default uses UTF-8 in console/terminal.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

CSV file and loop list of URL in Python - python

Seems to me you're using beta instead of Beta. Just fix the capital letter.

Related

Can you scrape multiple stock prices on a recurring schedule?

How to change Json data output in table format

Create Rows and Columns in BeautifulSoup

Yandex Spellchecker API Returns Empty Array

Pandas include Key to json file

Categories

Resources