Unable to store a pandas DataFrame as a CSV - Python

I am following this tutorial to retrieve data from news sites.
The main function is getDailyNews. It loops over each news source, requests the API, extracts the data, dumps it into a pandas DataFrame, and then exports the result to a CSV file.
But when I run the code, I get an error.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from tqdm import tqdm, tqdm_notebook
from functools import reduce

def getSources():
    source_url = 'https://newsapi.org/v1/sources?language=en'
    response = requests.get(source_url).json()
    sources = []
    for source in response['sources']:
        sources.append(source['id'])
    return sources

def mapping():
    d = {}
    response = requests.get('https://newsapi.org/v1/sources?language=en')
    response = response.json()
    for s in response['sources']:
        d[s['id']] = s['category']
    return d

def category(source, m):
    try:
        return m[source]
    except:
        return 'NC'

def getDailyNews():
    sources = getSources()
    key = '96f279e1b7f845669089abc016e915cc'
    url = 'https://newsapi.org/v1/articles?source={0}&sortBy={1}&apiKey={2}'
    responses = []
    for i, source in tqdm_notebook(enumerate(sources), total=len(sources)):
        try:
            u = url.format(source, 'top', key)
        except:
            u = url.format(source, 'latest', key)
        response = requests.get(u)
        r = response.json()
        try:
            for article in r['articles']:
                article['source'] = source
            responses.append(r)
        except:
            print('Rate limit exceeded ... please wait and retry in 6 hours')
            return None
    articles = list(map(lambda r: r['articles'], responses))
    articles = list(reduce(lambda x, y: x + y, articles))
    news = pd.DataFrame(articles)
    news = news.dropna()
    news = news.drop_duplicates()
    news.reset_index(inplace=True, drop=True)
    d = mapping()
    news['category'] = news['source'].map(lambda s: category(s, d))
    news['scraping_date'] = datetime.now()
    try:
        aux = pd.read_csv('./data/news.csv')
        aux = aux.append(news)
        aux = aux.drop_duplicates('url')
        aux.reset_index(inplace=True, drop=True)
        aux.to_csv('./data/news.csv', encoding='utf-8', index=False)
    except:
        news.to_csv('./data/news.csv', index=False, encoding='utf-8')
    print('Done')

if __name__ == '__main__':
    getDailyNews()
Error:
FileNotFoundError: [Errno 2] No such file or directory: './data/news.csv'
I know that I have to give a path name in pd.read_csv, but I don't know which path to give here.

This error makes sense if there is no data folder in the directory you are executing this program from. The bare except catches the FileNotFoundError raised by pd.read_csv, but the news.to_csv call in the except branch then fails with the same error, because the ./data/ directory itself does not exist. There is a similar problem in the post here.
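One way to fix it is to create the data directory before writing. A minimal sketch, assuming the CSV should live in a ./data folder relative to the working directory (the small news DataFrame below is a placeholder for the one built in getDailyNews):

import os
import pandas as pd

# Placeholder for the DataFrame that getDailyNews() builds from the API responses.
news = pd.DataFrame([{'url': 'https://example.com/article', 'title': 'Example'}])

csv_path = './data/news.csv'
os.makedirs(os.path.dirname(csv_path), exist_ok=True)  # create ./data if it is missing

try:
    aux = pd.read_csv(csv_path)                          # append to an existing file
    aux = pd.concat([aux, news]).drop_duplicates('url')  # pd.concat replaces the deprecated DataFrame.append
    aux.reset_index(inplace=True, drop=True)
    aux.to_csv(csv_path, encoding='utf-8', index=False)
except FileNotFoundError:                                # first run: no file yet
    news.to_csv(csv_path, index=False, encoding='utf-8')

Catching FileNotFoundError instead of using a bare except also keeps rate-limit or parsing errors from silently overwriting the existing file.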

Related

How to get Amazon's bestseller categories and all child nodes

I made a Python function to get all of the categories and their child nodes down to the last one. I want the output to look like this:
{'https://www.amazon.ae/gp/bestsellers/appliances/': ['Heating And Cooling', 'https://www.amazon.ae/gp/bestsellers/appliances/12134072031']['Air Conditioners', 'https://www.amazon.ae/gp/bestsellers/kitchen/15298093031']['Cabinet Air Conditioners', 'https://www.amazon.ae/gp/bestsellers/kitchen/15298093031']
My code:
import requests
from bs4 import BeautifulSoup as bs
import time
from tqdm import tqdm

_seen_categories = []

def crawl(url):
    r = requests.get(url)
    time.sleep(2)
    s = bs(r.text, "html.parser")
    try:
        treeitems = s.find("span", class_="_p13n-zg-nav-tree-all_style_zg-selected__1SfhQ").find_next("div", {"role": "group"}).find_all("div", {"role": "treeitem"})
    except:
        treetiems = None
    fullDict = []
    for treeitem in tqdm(treeitems):
        a = treeitem.find_next("a")
        d = {url: [a.text.strip(), a["href"]]}
        fullDict.append(d)
        print(a.text.strip())
        print(a["href"])
        if treeitems is not None:
            next_url = "https://www.amazon.ae" + a['href']
            try:
                if next_url not in _seen_categories:
                    crawl(next_url)
            except:
                pass
            else:
                _seen_categories.append(next_url)
        time.sleep(2)

crawl("https://www.amazon.ae/gp/bestsellers/appliances")
This function is not producing the format I expect. I need help completing it.
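For reference, one way to collect the crawled entries into the kind of mapping the question describes is a dict from each parent category URL to a list of [name, URL] pairs. A minimal sketch with hypothetical sample data (the names and URLs below are placeholders, not scraped values):

from collections import defaultdict

# Hypothetical crawl results: (parent_category_url, child_name, child_href)
items = [
    ('https://www.amazon.ae/gp/bestsellers/appliances/', 'Heating And Cooling', '/gp/bestsellers/appliances/12134072031'),
    ('https://www.amazon.ae/gp/bestsellers/appliances/', 'Air Conditioners', '/gp/bestsellers/kitchen/15298093031'),
]

tree = defaultdict(list)
for parent_url, name, href in items:
    # Each parent URL maps to a list of [child name, absolute child URL] pairs.
    tree[parent_url].append([name, 'https://www.amazon.ae' + href])

print(dict(tree))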

How to get URLs from two dropdown lists (web scraping with Python)

I want to web-scrape this web page (www.autocar.co.uk). To do so, I want to select each car manufacturer in a dropdown menu, then each model, to get the HREF/reference to the model page, and then retrieve some information from each model page (not reflected in the code yet).
As I just started coding, I would highly appreciate your input! Thanks in advance!! :)
Desired output:
https://www.autocar.co.uk/car-review/abarth/595
https://www.autocar.co.uk/car-review/abarth/595-competizione
https://www.autocar.co.uk/car-review/abarth/124-spider-2016-2019
https://www.autocar.co.uk/car-review/abarth/695-biposto-2015-2016
https://www.autocar.co.uk/car-review/ac-schnitzer/acs3-sport
https://www.autocar.co.uk/car-review/ac-schnitzer/acs1
https://www.autocar.co.uk/car-review/ac-schnitzer/acs5-sport
https://www.autocar.co.uk/car-review/allard/j2x-mkii
https://www.autocar.co.uk/car-review/alfa-romeo/giulia
https://www.autocar.co.uk/car-review/alfa-romeo/tonale
Output as of now; the "https://www.autocar.co.uk0" entries need to be removed:
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/abarth/595
https://www.autocar.co.uk/car-review/abarth/595-competizione
https://www.autocar.co.uk/car-review/abarth/124-spider-2016-2019
https://www.autocar.co.uk/car-review/abarth/695-biposto-2015-2016
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/ac-schnitzer/acs3-sport
https://www.autocar.co.uk/car-review/ac-schnitzer/acs1
https://www.autocar.co.uk/car-review/ac-schnitzer/acs5-sport
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/allard/j2x-mkii
https://www.autocar.co.uk0
https://www.autocar.co.uk/car-review/alfa-romeo/giulia
https://www.autocar.co.uk/car-review/alfa-romeo/tonale
Code as of now:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Inputs/URLs to scrape:
url = "http://www.autocar.co.uk/"
s = requests.Session()
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
full_car_list = []
car_list = [(x.text, x.get("value"), f'https://www.autocar.co.uk/ajax/car-models/{x.get("value")}/0') for x in soup.select_one('#edit-make').select('option')]
for x in car_list:
    r = s.get(x[2])
    try:
        for item in r.json()['options'].items():
            # Car model
            car_model_url = f'https://www.autocar.co.uk{item[0]}'
            print(car_model_url)
    except Exception as e:
        full_car_list.append((x[0], 'no models', f'https://www.autocar.co.uk/vehicles/{x[0]}'))
You'll want to refactor things into a couple of functions for clarity; that also makes it easier to skip data that isn't valid (apparently occasionally you'd get a list from the ajax/car-models API):
from bs4 import BeautifulSoup
import requests

sess = requests.Session()

def get_make_info():
    resp = sess.get("http://www.autocar.co.uk/")
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    for option in soup.select('#edit-make option'):
        make_id = option['value']
        yield (make_id, option.text)

def get_make_models(make_id):
    info_url = f'https://www.autocar.co.uk/ajax/car-models/{make_id}/0'
    resp = sess.get(info_url)
    resp.raise_for_status()
    data = resp.json()
    options = data['options']
    if isinstance(options, list):  # Invalid format, skip
        return
    for model_url, model_name in options.items():
        if model_url == "0":  # "All models"
            continue
        model_url = f'https://www.autocar.co.uk{model_url}'
        yield (model_url, model_name)

for make_id, make_name in get_make_info():
    for model_url, model_name in get_make_models(make_id):
        print(make_id, make_name, model_url, model_name)
Using the code as written for your previous question, all you have to do is print out the 'Url' column of the DataFrame:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "http://www.autocar.co.uk/"
s = requests.Session()
r = s.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
full_car_list = []
car_list = [(x.text, x.get("value"), f'https://www.autocar.co.uk/ajax/car-models/{x.get("value")}/0') for x in soup.select_one('#edit-make').select('option')]
for x in car_list:
    r = s.get(x[2])
    try:
        for item in r.json()['options'].items():
            full_car_list.append((x[0], item[1], f'https://www.autocar.co.uk{item[0]}'))
    except Exception as e:
        full_car_list.append((x[0], 'no models', f'https://www.autocar.co.uk/vehicles/{x[0]}'))
cars_df = pd.DataFrame(full_car_list[1:], columns=['Make', 'Model', 'Url'])
cars_df = cars_df[cars_df.Model != 'All models']
cars_df.to_csv('makes_models.csv')
for x in cars_df.Url.tolist():
    print(x)

Apply the code to smaller batches of the data set sequentially

I have a data set of tweets retrieved via the Twitter streaming API.
However, I regularly want to be updated on how the public metrics change, so I wrote code to request those public metrics:
import json

import pandas as pd
import requests

def create_url():
    tweet_fields = "tweet.fields=public_metrics"
    tweets_data_path = 'dataset.txt'
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            continue
    df = pd.DataFrame.from_dict(pd.json_normalize(tweets_data), orient='columns')
    df_id = (str(str((df['id'].tolist()))[1:-1])).replace(" ", "")
    ids = "ids=" + df_id
    url = "https://api.twitter.com/2/tweets?{}&{}".format(ids, tweet_fields)
    return url

def bearer_oauth(r):
    r.headers["Authorization"] = f"Bearer {'AAAAAAAAAAAAAAAAAAAAAN%2B7QwEAAAAAEG%2BzRZkmZ4HGizsKCG3MkwlaRzY%3DOwuZeaeHbeMM1JDIafd5riA1QdkDabPiELFsguR4Zba9ywzzOQ'}"
    r.headers["User-Agent"] = "v2TweetLookupPython"
    return r

def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()

def main():
    url = create_url()
    json_response = connect_to_endpoint(url)
    print(json.dumps(json_response, indent=3, sort_keys=True))

if __name__ == "__main__":
    main()
Unfortunately, my data set has more than 100 IDs in it and I want to retrieve the metrics for all of them. Since I can only request 100 IDs at a time, can you help me with how to do that?
I would also like to make the request daily at midnight and then store the result in a txt file; maybe you can help me with that as well?
You can chunk your data and send it in batches using itertools.islice.
test.py:
import reprlib
from itertools import islice

import pandas as pd

BASE_URL = "https://api.twitter.com/2/tweets"
CHUNK = 100

def req(ids):
    tmp = reprlib.repr(ids)  # Used here just to shorten the output
    print(f"{BASE_URL}?ids={tmp}")

def main():
    df = pd.DataFrame({"id": range(1000)})
    it = iter(df["id"])
    while chunk := tuple(islice(it, CHUNK)):
        ids = ",".join(map(str, chunk))
        req(ids)

if __name__ == "__main__":
    main()
Test:
$ python test.py
https://api.twitter.com/2/tweets?ids='0,1,2,3,4,5,...5,96,97,98,99'
https://api.twitter.com/2/tweets?ids='100,101,102,...6,197,198,199'
https://api.twitter.com/2/tweets?ids='200,201,202,...6,297,298,299'
https://api.twitter.com/2/tweets?ids='300,301,302,...6,397,398,399'
https://api.twitter.com/2/tweets?ids='400,401,402,...6,497,498,499'
https://api.twitter.com/2/tweets?ids='500,501,502,...6,597,598,599'
https://api.twitter.com/2/tweets?ids='600,601,602,...6,697,698,699'
https://api.twitter.com/2/tweets?ids='700,701,702,...6,797,798,799'
https://api.twitter.com/2/tweets?ids='800,801,802,...6,897,898,899'
https://api.twitter.com/2/tweets?ids='900,901,902,...6,997,998,999'
Note: You'll make multiple requests with this approach, so keep in mind any rate limits.
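The question also asks about running the request daily at midnight and storing the result in a txt file, which the chunking above does not cover. A minimal sketch using only the standard library (request_metrics is a hypothetical stand-in for the chunked requests shown above):

import json
import time
from datetime import datetime, timedelta

def request_metrics():
    # Hypothetical stand-in for the chunked Twitter API calls shown above.
    return {'fetched_at': datetime.utcnow().isoformat(), 'data': []}

def seconds_until_midnight():
    now = datetime.now()
    next_midnight = (now + timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)
    return (next_midnight - now).total_seconds()

while True:
    time.sleep(seconds_until_midnight())          # wait for the next midnight
    result = request_metrics()
    stamp = datetime.now().strftime('%Y-%m-%d')
    with open(f'metrics_{stamp}.txt', 'a') as f:  # one txt file per day
        f.write(json.dumps(result) + '\n')

In practice, a cron job (or the schedule package) that runs the script once a day is usually simpler than keeping a long-running loop alive.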

Request an API with multi-threading in Python

How can I request the following API with multi-threading in Python? Or is there another way to make the API requests more efficient, such as multi-processing or asyncio?
import pandas as pd
import requests
import json

def getAPIreturn(row):
    para = row['para']
    url = f"http://localhost/search?name={para}"
    try:
        return_json = json.loads(requests.get(url).json)
        data = return_json['data']
        score = data[0]['score']
        count = data[1]['count']
        status = data[2]['status']
    except:
        score = None
        count = None
        status = None
    row['score'] = score
    row['count'] = count
    row['status'] = status
    return row

data = pd.read_csv("input.csv")
data = data.apply(getAPIreturn, axis=1)
data.to_csv("output.csv", index=False)
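One common approach is concurrent.futures.ThreadPoolExecutor from the standard library, since each request spends most of its time waiting on the network. A minimal sketch under the same assumptions as the question (the localhost endpoint and the para/score/count/status fields are taken from the code above; the response is assumed to be JSON):

from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests

def fetch(para):
    # One API call per row; returns (score, count, status), or Nones on any failure.
    try:
        data = requests.get(f'http://localhost/search?name={para}', timeout=10).json()['data']
        return data[0]['score'], data[1]['count'], data[2]['status']
    except Exception:
        return None, None, None

data = pd.read_csv('input.csv')

# Threads overlap the waiting time of the HTTP calls; pool.map preserves input order.
with ThreadPoolExecutor(max_workers=16) as pool:
    results = list(pool.map(fetch, data['para']))

data['score'], data['count'], data['status'] = zip(*results)
data.to_csv('output.csv', index=False)

asyncio with aiohttp would achieve a similar effect; multi-processing is rarely worth it here because the work is I/O-bound rather than CPU-bound.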

Parsing a stock's recommended rating from the Yahoo Finance site

I'm looking to parse a specific Yahoo stock page using a Python script (take https://finance.yahoo.com/quote/NOA?ltr=1 for example) and print the "Recommended Rating" to a file. The recommended rating can be found on the right-hand side of the page, about halfway down.
This is what I have so far:
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
from bs4 import BeautifulSoup

quote_page = 'https://finance.yahoo.com/quote/NOA?ltr=1'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, "html.parser")
name_box = soup.find(attrs={'div': 'rating-text Arrow South Fw(b) Bgc($strongBuy) Bdtc($strongBuy)'})
name = name_box.text.strip()
print(name)
The tricky part is that I believe the recommended rating is only listed on the page as inner HTML. I'm not sure how I'd go about retrieving this data; a push in the right direction would be greatly appreciated!
Yahoo makes a GET request to the URL in the script below for some of their data. If you look in the network tab of the developer tools and refresh the page for the NOA stock, you should see 'NOA?formatt...'. Click this and then view the response object to see some of the data. You'll need the requests module for the script below to work: pip install requests.
# get_mean_recs.py
import csv
from datetime import datetime
import requests
import sys

get_date = lambda: datetime.utcnow().strftime('%d-%m-%Y')

lhs_url = 'https://query2.finance.yahoo.com/v10/finance/quoteSummary/'
rhs_url = '?formatted=true&crumb=swg7qs5y9UP&lang=en-US&region=US&' \
          'modules=upgradeDowngradeHistory,recommendationTrend,' \
          'financialData,earningsHistory,earningsTrend,industryTrend&' \
          'corsDomain=finance.yahoo.com'

def get_mean_rec(ticker):
    url = lhs_url + ticker + rhs_url
    r = requests.get(url)
    if not r.ok:
        return -1
    result = r.json()['quoteSummary']['result'][0]
    return result['financialData']['recommendationMean']['fmt']

def read_from_csv(fn):
    with open(fn, 'r') as f:
        reader = csv.reader(f)
        for line in reader:
            for ticker in line:
                yield ticker

def write_to_csv(fn, data):
    with open(fn, 'a') as f:
        fieldnames = data[0].keys()
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        for item in data:
            writer.writerow(item)

def assemble_dict(ticker):
    return {
        'ticker': ticker,
        'mean_rec': get_mean_rec(ticker),
        'utc_date': get_date()
    }

def main():
    in_fn = sys.argv[1]
    out_fn = sys.argv[2]
    data = [assemble_dict(ticker) for ticker in read_from_csv(in_fn)]
    write_to_csv(out_fn, data)

if __name__ == '__main__':
    main()
Usage:
python get_mean_recs.py input.csv output.csv
There is an API for accessing Yahoo Finance information, e.g.
http://finance.yahoo.com/d/quotes.csv?s=NOA&f=snd1l1yr
I think you may be better off using that to fetch the required information. Some more info on the parameters can be found here:
http://wern-ancheta.com/blog/2015/04/05/getting-started-with-the-yahoo-finance-api/
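A minimal sketch of calling that CSV endpoint, assuming it still responds (Yahoo has changed and retired these endpoints over the years, so treat this as illustrative; the format string is taken from the example URL above):

import csv
import io

import requests

url = 'http://finance.yahoo.com/d/quotes.csv'
params = {'s': 'NOA', 'f': 'snd1l1yr'}  # format string copied from the example URL above

resp = requests.get(url, params=params, timeout=10)
resp.raise_for_status()

# The endpoint returns plain CSV rows, one per requested symbol.
for row in csv.reader(io.StringIO(resp.text)):
    print(row)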
