How to categorize these websites as non-working? - python

I have been developing a program that checks whether a website is working or not. I fetch URLs from an Excel sheet and write the results back to the same sheet as True (working) or False (non-working). However, for some URLs, such as http://www.andrewelliotgroup.com/ and https://www.sovranollc.com/, the program reports the website as working even though there is no data on the website. I want to classify these websites with no data as non-working. Apart from the status code, what other KPI can I use to detect this? Here is the code:
import http.client as httpc
from urllib.parse import urlparse
import pandas as pd
import xlwings as xw
import smtplib
from xlsxwriter import Workbook
from xlutils import copy
import socket
import requests
from bs4 import BeautifulSoup as BS
def checkUrl(i):
    """Return True if the server behind URL *i* answers HEAD with a status below 400.

    Args:
        i: URL taken from the spreadsheet. When no ``http://`` / ``https://``
            prefix is present, ``https://`` is assumed. Non-string or blank
            values (for example NaN from an empty cell) count as not working.

    Returns:
        bool: True when a response with status < 400 was received, False on
        any connection/protocol error, malformed URL, or status >= 400.

    Note:
        True only means that the server responded. A site that loads but
        shows no content still yields True; detecting that requires a GET
        request and an inspection of the response body.
    """
    if not isinstance(i, str) or not i.strip():
        return False
    i = i.strip()
    # Test the prefix, not mere containment: "example.com/http-guide" has no scheme.
    if not i.lower().startswith(('http://', 'https://')):
        i = 'https://' + i
    try:
        p = urlparse(i)
    except ValueError:
        # e.g. an unbalanced "[" in the host part
        return False
    if not p.netloc:
        return False
    # Keep the query string; an empty path is requested as "/".
    target = p.path or '/'
    if p.query:
        target += '?' + p.query
    try:
        if p.scheme == 'http':
            conn = httpc.HTTPConnection(p.netloc, timeout=15)
        else:
            conn = httpc.HTTPSConnection(p.netloc, timeout=15)
    except httpc.InvalidURL:
        # Raised by the constructor for a non-numeric port
        return False
    try:
        conn.request('HEAD', target)
        resp = conn.getresponse()
        return resp.status < 400
    except (OSError, httpc.HTTPException):
        return False
    finally:
        # Always release the socket, whatever the outcome
        conn.close()


def main():
    """Read the URLs from xyz.xlsx, check each one and write the results back."""
    # Kept from the original script; the result is discarded.
    # NOTE(review): presumably a connectivity/proxy probe — confirm it is still needed.
    socket.getaddrinfo('172.23.24.143', 8080)

    df = pd.read_excel(r'xyz.xlsx')
    df1 = pd.DataFrame(df, columns=['URL'])
    print(df1)
    url_list = df["URL"].tolist()
    print(url_list)

    x = []
    for i in url_list:
        # Check each URL once and reuse the result for both print and report
        working = checkUrl(i)
        print(working)
        x.append(working)

    # Overwrites the input workbook with a two-column report (URL, Value)
    workbook = Workbook('xyz.xlsx')
    Report_Sheet = workbook.add_worksheet()
    Report_Sheet.write(0, 1, 'Value')
    Report_Sheet.write_column(1, 1, x)
    Report_Sheet.write(0, 0, 'URL')
    Report_Sheet.write_column(1, 0, url_list)
    workbook.close()


if __name__ == "__main__":
    main()

Related

Appending Dataframe to another dataframe with first row removed

# Import libs
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
# Request body sent with every POST; 'objid' is overwritten on each iteration
formdata = {'objid': '14'}
# Endpoint that is queried once per objid
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
# Fetch objid 1..14 and dump every response into its own CSV file
for i in range(1, 15):
    formdata["objid"] = str(i)
    response = requests.request("POST", url, data=formdata, timeout=1500)
    # The body is passed through BeautifulSoup and its text is decoded as JSON
    soup = BeautifulSoup(response.content, "html.parser")
    bat = json.loads(soup.text)
    # The "ops1" entry of the decoded payload becomes the frame
    df = pd.DataFrame(bat["ops1"])
    df.to_csv(f"{i}.csv")
Right now this query creates 14 CSV files. What I want is for the for loop to drop the first row of column headers and append the data to a dataframe created outside the loop, so that I get a single CSV file.
I am using BS and Pandas.
This is one way of achieving your goal:
# Import libs
import pandas as pd
import requests
from tqdm import tqdm ## if using jupyter: from tqdm.notebook import tqdm
# One frame per objid is collected here and concatenated once after the loop;
# concatenating onto a growing frame inside the loop re-copies all rows each time.
frames = []
# URL
url = "https://www.sec.kerala.gov.in/public/getalllbcmp/byd"
# Query
for i in tqdm(range(1, 15)):
    formdata = {'objid': i}
    # Without a timeout requests waits indefinitely on a stalled server
    r = requests.post(url, data=formdata, timeout=30)
    # Fail loudly on an HTTP error instead of trying to decode an error page
    r.raise_for_status()
    frames.append(pd.json_normalize(r.json()["ops1"]))
# ignore_index=True renumbers the rows 0..n-1 across all responses
final_df = pd.concat(frames, axis=0, ignore_index=True)
final_df.to_csv('some_data_saved.csv')
print(final_df)
Data will be saved to a csv file, and also printed in terminal:
100%
14/14 [00:14<00:00, 1.05s/it]
value text
0 8o7LEdvX2e G14001-Kumbadaje
1 jw2XOQyZ4K G14002-Bellur
2 0lMB1O4LbV G14003-Karadka
3 zodLro2Z39 G14004-Muliyar
4 dWxLYn8ZME G14005-Delampady
... ... ...
1029 Qy6Z09bBKE G01073-Ottoor
1030 ywoXG8wLxV M01001-Neyyattinkara
1031 Kk8Xvz7XO9 M01002-Nedumangad
1032 r7eXQYgX8m M01003-Attingal
1033 b3KXlO2B8g M01004-Varkala
1034 rows × 2 columns
Requests can decode JSON responses itself (via r.json()), so you don't need to import bs4 & json.
For TQDM, please see https://pypi.org/project/tqdm/
For pandas documentation, visit https://pandas.pydata.org/docs/
Also for Requests: https://requests.readthedocs.io/en/latest/
I would use a function to get the data and return a DataFrame, then use it within concat:
def get_data(i):
    """Fetch the entries for one ``objid`` and return them as a DataFrame.

    Args:
        i: The objid to request; it is sent as a string in the POST body.

    Returns:
        pandas.DataFrame built from the "ops1" entry of the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Build the body locally instead of mutating a shared module-level dict
    response = requests.request("POST", url, data={"objid": str(i)}, timeout=1500)
    response.raise_for_status()
    # Decode the JSON body directly; passing it through an HTML parser first
    # can alter the payload (tags stripped, entities decoded).
    return pd.DataFrame(response.json()["ops1"])


# ignore_index=True gives the combined frame one continuous 0..n-1 index
# instead of repeating each response's own 0-based index in the CSV.
df = pd.concat([get_data(i) for i in range(1, 15)], ignore_index=True)
df.to_csv('all_data.csv')
NB. if this gives you unsatisfactory results, please provide a short extract of 2/3 dataframes and the expected merged output.

Create empty dataframes in Google Sheets

I am creating a web scraping program using python, BeautifulSoup, pandas and Google Sheets.
Up until now I have managed to scrape data tables from urls which I’m getting from a list in Google sheets - I have created data frames for each dataset. From my list of urls, some of the cells in the column is empty, which gives me the following error when I try to import the dataframes into another sheet:
MissingSchema: Invalid URL '': No schema supplied. Perhaps you meant
http://?
What I’d like to achieve is, that for every cell that’s empty in the sheets with urls, I would like to create an empty dataframe, just like the ones with data inside them. Is that possible?
My code so far looks like this:
import gspread
from df2gspread import df2gspread as d2g
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
from bs4 import BeautifulSoup
import pandas as pd
import requests
# Load the service-account credentials from a local JSON key file
credentials = service_account.Credentials.from_service_account_file(
'credentials.json')
# Derive credentials limited to the spreadsheets feed and Drive scopes
scoped_credentials = credentials.with_scopes(
['https://spreadsheets.google.com/feeds',
'https://www.googleapis.com/auth/drive']
)
# Create the gspread client, then replace its session with an authorized
# session built from the same scoped credentials
gc = gspread.Client(auth=scoped_credentials)
gc.session = AuthorizedSession(scoped_credentials)
# NOTE(review): despite its name this variable holds whatever open_by_key()
# returns (presumably the spreadsheet object), and 'api_key' looks like a
# placeholder for the real spreadsheet key — confirm
spreadsheet_key = gc.open_by_key('api_key')
# Data import: worksheet "Data" is the target the scraped frames are written to
data_worksheet = spreadsheet_key.worksheet("Data")
# Url's: the values of column 2 of worksheet "Urls" are the links to scrape
url_worksheet = spreadsheet_key.worksheet("Urls")
link_list = url_worksheet.col_values(2)
def get_info(linkIndex):
    """Scrape the first table at ``link_list[linkIndex]`` into the data worksheet.

    Each table row contributes one record: the text of its first ``th`` goes
    to "Labels" and the text of its first ``td`` to "Data". Rows labelled
    "Links" are dropped. The frame is written without a header starting at
    row 2, column ``linkIndex * 6 + 1``.

    Args:
        linkIndex: Position of the URL inside ``link_list``.

    Returns:
        pandas.DataFrame: the frame that was written. An empty frame with the
        same two columns is returned, and nothing is written, when the URL
        cell is blank, the page has no table, or processing fails, so the
        column slot reserved for that link simply stays empty in the sheet.
    """
    empty = pd.DataFrame(columns=['Labels', 'Data'])
    link = link_list[linkIndex]
    # A blank cell would make requests.get('') raise MissingSchema
    if not link or not str(link).strip():
        return empty
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    try:
        tbl = soup.find('table')
        if tbl is None:
            # No table on the page: nothing to extract
            return empty
        final_results = []
        for tr in tbl.find_all('tr'):
            headers = [th.text.strip() for th in tr.find_all('th')]
            data = [td.text.strip() for td in tr.find_all('td')]
            final_results.append({'Labels': headers, 'Data': data})
        df = pd.DataFrame(final_results, columns=['Labels', 'Data'])
        # Keep only the first header / first cell of every row
        df['Labels'] = df['Labels'].str[0]
        df['Data'] = df['Data'].str[0]
        df = df[df['Labels'] != 'Links']
        # set_with_dataframe() has no return value to slice, so the former
        # trailing "[1:]" raised TypeError on every call.
        set_with_dataframe(data_worksheet, df, col=(linkIndex*6)+1, row=2,
                           include_column_header=False)
        return df
    except Exception as e:
        # Best effort: report the problem and carry on with the next link
        print(e)
        return empty
# Process every entry of link_list except index 0
# (presumably the column header — verify against the sheet)
for linkInd in range(1, len(link_list)):
    get_info(linkInd)
It depends on what you mean by an empty dataframe. If you mean a dataframe containing no data, it can be created with the statement pd.DataFrame(). If you mean a dataframe containing NaN / None values in the same columns as the other dataframes, it can be created from a dict:
import numpy as np
import pandas as pd

# x is the amount of rows in dataframe
# (example value; use the row count of the dataframes that do contain data)
x = 3
# One all-NaN list of length x per column; np.nan is the spelling that works
# on every NumPy version (the np.NaN alias was removed in NumPy 2.0)
d = {
    'column1': [np.nan] * x,
    'column2': [np.nan] * x,
    'column3': [np.nan] * x,
}
df = pd.DataFrame(d)
In the beginning of get_info() function there should be a check added:
if link_list[linkIndex] is not None: # or if link_list[linkIndex] != '' (depending on format of an empty cell)
The existing logic should be placed in the if branch, and an empty dataframe should be created in the else branch. The function set_with_dataframe() should be called after the if / else statement, because it is executed in both cases.

scrape blocks of data from webpage API

I am trying to collect blocks of data that form a small table on a webpage. Please see my code below.
`
import requests
import re
import json
import sys
import os
import time
from lxml import html,etree
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.investing.com/instruments/OptionsDataAjax'
params = {
    'pair_id': 525,            # SPX
    'date': 1536555600,        # Unix timestamp: 2018-09-10 05:00 UTC
    'strike': 'all',           # all prices
    'callspots': 'calls',      # alternative: 'call_andputs'
    'type': 'analysis',        # webpage viewer
    'bringData': 'true',
}
headers = {'User-Agent': 'Chrome/39.0.2171.95 Safari/537.36'}


def R(text, end='\n'):
    """Print *text* wrapped in the ANSI escape codes for red."""
    print('\033[0;31m{}\033[0m'.format(text), end=end)


def G(text, end='\n'):
    """Print *text* wrapped in the ANSI escape codes for green."""
    print('\033[0;32m{}\033[0m'.format(text), end=end)


page = requests.get(url, params=params, headers=headers)
if page.status_code != 200:
    R('ERROR CODE:{}'.format(page.status_code))
    R('Problem in connection!')
    # sys.exit must be called; the bare name is a no-op and let the script
    # continue into the parsing code with an error response.
    sys.exit(1)
G('OK')
# The JSON body is passed through BeautifulSoup first; its text is then decoded
soup = BeautifulSoup(page.content, 'lxml')
spdata = json.loads(soup.text)
print(spdata['data'])`
The result, spdata['data'], is a str. I just want to extract blocks like the following one from it; the string contains many such data blocks, all in the same format.
SymbolSPY180910C00250000
Delta0.9656
Imp Vol0.2431
Bid33.26
Gamma0.0039
Theoretical33.06
Ask33.41
Theta-0.0381
Intrinsic Value33.13
Volume0
Vega0.0617
Time Value-33.13
Open Interest0
Rho0.1969
Delta / Theta-25.3172
I use json and BeautifulSoup here; maybe a regular expression would help, but I don't know much about re. Any approach that gets the result is appreciated. Thanks.
Add this after your code:
# One match is one option block. It starts at the literal "SymbolSPY"
# (group 2 additionally takes any directly following run of the digits 1-9).
# The lazy (.*?) then extends, across newlines because of re.DOTALL, up to the
# first line that holds nothing but non-newline whitespace.
# The pattern has no ^ or $ anchors, so re.MULTILINE does not change the result.
regex = r"((SymbolSPY[1-9]*):?\s*)(.*?)\n[^\S\n]*\n[^\S\n]*"
# Walk over every block found in the response text
for match in re.finditer(regex, spdata['data'], re.MULTILINE | re.DOTALL):
# match.group() is the whole matched block; print it line by line,
# with surrounding whitespace removed
for line in match.group().splitlines():
print (line.strip())
Outputs
OK
SymbolSPY180910C00245000
Delta0.9682
Imp Vol0.2779
Bid38.26
Gamma0.0032
Theoretical38.05
Ask38.42
Theta-0.0397
Intrinsic Value38.13
Volume0
Vega0.0579
Time Value-38.13
Open Interest0
Rho0.1934
Delta / Theta-24.3966
SymbolSPY180910P00245000
Delta-0.0262
Imp Vol0.2652
...

why doesnt the below code work for stock option on yahoo finance

This code does not work for Yahoo Finance or Google Finance, but it
works for 'http://www.bloomberg.com/markets/currencies/cross-rates':
import pandas as pd
from lxml.html import parse
from urllib2 import urlopen
from pandas.io.parsers import TextParser
def _unpack(row, kind='td'):
    """Return the text content of every ``kind`` element found below ``row``."""
    cells = row.findall('.//{}'.format(kind))
    texts = []
    for cell in cells:
        texts.append(cell.text_content())
    return texts
def parse_options_data(table):
    """Turn an HTML table element into the chunk produced by TextParser.

    The ``th`` cells of the first ``tr`` supply the column names; the ``td``
    cells of every following ``tr`` supply one record each.
    """
    all_rows = table.findall('.//tr')
    column_names = _unpack(all_rows[0], kind='th')
    records = []
    for body_row in all_rows[1:]:
        records.append(_unpack(body_row))
    parser = TextParser(records, names=column_names)
    return parser.get_chunk()
# Download the option-chain page and parse the response with lxml.html
# NOTE(review): "ouput" in the query string looks like a typo for "output" — confirm
parsed = parse(urlopen('https://www.google.com/finance/option_chain?q=AAPL&ouput'))
doc = parsed.getroot()
# Collect every <table> element of the document
tables = doc.findall('.//table')
# Run the second table found (index 1) through parse_options_data
# NOTE(review): raises IndexError when the page has fewer than two tables
table = parse_options_data(tables[1])
# Bare expression: it only displays the result in an interactive session
table
how do i fix?

Request a html file

This link lets me get a random item from database. However, I would like to automatically retrieve items using Python. Here's my code:
import sys
from urllib.parse import urlencode
from urllib.request import urlopen
# parameters
# Fields that are encoded into the query string of the request
data = {}
# NOTE(review): "reviewd" looks like a misspelling of "reviewed", and the
# literal "+" characters are passed to urlencode() as part of the value —
# confirm against the query syntax the site expects
data["query"] = "reviewd:yes+AND+organism:9606"
data["random"] = "yes"
url_values = urlencode(data)
url = "http://www.uniprot.org/uniprot/"
full_url = url + '?' + url_values
# From here on `data` no longer is the parameter dict but the object
# returned by urlopen()
data = urlopen(full_url)
# NOTE(review): the file is never closed, and str() applied to the value of
# read() is written as-is — under Python 3 that presumably stores the
# "b'...'" representation rather than the page text; confirm
out = open("1.html", 'w')
out.write(str(data.read()))
However, I cannot get the desired page. Does anyone know what's wrong with my code? I'm using Python 3.x.
You have several issues:
reviewd is misspelled, it should be reviewed
The base url needs to have /uniprot/ at the end
You need to use space instead of + in your query string
Here is what that would look like:
import sys
from urllib.parse import urlencode
from urllib.request import urlopen
# parameters
data = {}
data["query"] = "reviewed:yes AND organism:9606"
data["random"] = "yes"
# urlencode() turns the spaces into "+" and escapes the ":" characters
url_values = urlencode(data)
url = "http://www.uniprot.org/uniprot/"
full_url = url + '?' + url_values
# read() returns bytes, so the file is opened in binary mode and the raw
# bytes are stored; str(bytes) would write the "b'...'" representation.
# The with-statement closes both the response and the file.
with urlopen(full_url) as response, open("1.html", 'wb') as out:
    out.write(response.read())
This produces the following URL:
http://www.uniprot.org/uniprot/?query=reviewed%3Ayes+AND+organism%3A9606&random=yes

Categories

Resources