Read the content of an internet page with Python

I am playing with Python and trying to parse an internet page in order to automate my football-watching process on my Amazon Fire TV.
I produced the following code to read HTML pages by URL:
from httplib import BadStatusLine
import urllib2
import logging

htmlWorker = html_worker.HtmlWorkerLiveFootball()
htmlWorker.get_list_of_matches(htmlWorker.URL)

class HtmlWorkerLiveFootball:
    URL = 'http://livefootball.ws/'

    def get_list_of_matches(self, url):
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler, HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36'
                              )]
        urllib2.install_opener(opener)
        try:
            logging.warning("request = %s" % opener.addheaders)
            page = urllib2.urlopen(url)
            logging.warning("result = %s" % page.read())
        except urllib2.HTTPError, error:
            logging.error("error code = %d" % error.code)
        except BadStatusLine:
            logging.error("could not fetch %s" % url)

class HeadRequest(urllib2.Request):
    def get_method(self):
        return "HEAD"

class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code in (301, 302, 303, 307):
            logging.warning("redirect_request = %d" % code)
            newurl = newurl.replace(' ', '%20')
            logging.warning("new url = %s" % newurl)
            logging.warning("headers = %s" % headers)
            newheaders = dict((k, v) for k, v in req.headers.items()
                              if k.lower() not in ("content-length", "content-type"))
            logging.debug("newheaders = %s" % newheaders)
            request = HeadRequest(newurl, headers=newheaders, origin_req_host=req.get_origin_req_host(),
                                  unverifiable=True)
            request.add_header('User-agent',
                               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36')
            request.add_header('Cookie', headers.dict['set-cookie'])
            request.add_header('Host', "livefootball.ws")
            request.add_header('Accept-Encoding', "gzip,deflate,sdch")
            request.add_header('Accept', "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
            request.add_header('Cache-Control', "max-age=0")
            request.add_header('Accept-Language', "en-US,en;q=0.8,ru;q=0.6")
            logging.warning("request = %s" % request.headers)
            return request
        else:
            raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

class HTTPMethodFallback(urllib2.BaseHandler):
    def http_error_405(self, req, fp, code, msg, headers):
        logging.warning("http_error_405. Headers = %s" % headers)
        fp.read()
        fp.close()
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in ("content-length", "content-type"))
        return self.parent.open(urllib2.Request(req.get_full_url(),
                                                headers=newheaders,
                                                origin_req_host=req.get_origin_req_host(),
                                                unverifiable=True))
It works for the majority of sites across the internet, but unfortunately the site I need seems to fend off DDoS attacks with a mechanism I'm not familiar with (a redirect plus something with cookies). I'm trying to emulate browser behavior, but I end up with an empty string.
Here is the log I get after executing this code:
WARNING:root:request = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36')]
WARNING:root:redirect_request = 307
WARNING:root:new url = http://livefootball.ws/?dos=1
WARNING:root:headers = Server: nginx
Date: Sun, 15 Jun 2014 14:11:03 GMT
Content-Type: text/html
Content-Length: 180
Connection: close
Set-Cookie: antid=6abeccafd9ac44951b4acc7f642649b7; path=/
Location: http://livefootball.ws/?dos=1
WARNING:root:request = {'Accept-language': 'en-US,en;q=0.8,ru;q=0.6', 'Accept-encoding': 'gzip,deflate,sdch', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36', 'Host': 'livefootball.ws', 'Cookie': 'antid=6abeccafd9ac44951b4acc7f642649b7; path=/', 'Cache-control': 'max-age=0'}
WARNING:root:result =
How can I read this page with Python? Thanks.

If you want to read HTML pages by URL, you can use the requests library instead of urllib2. It is much easier to use:
import requests
session = requests.Session()
index_url = 'http://livefootball.ws/'
index_request = session.get(index_url)
#change encoding of the response
index_request.encoding = 'CP1251'
#print page content
print index_request.text
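If you prefer to stay with urllib2, the behaviour that matters here is cookie handling: the 307 response sets the anti-DDoS cookie (antid) and redirects to /?dos=1, and the follow-up request has to send that cookie back. A cookie-aware opener does this automatically, which is also why the requests.Session approach above gets past the check. A minimal sketch with urllib2, using the same URL and user agent as in the question (no custom redirect handler needed):
import cookielib
import urllib2

# The CookieJar stores the "antid" cookie set on the 307 response and
# replays it when the redirect to http://livefootball.ws/?dos=1 is followed.
cookie_jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
opener.addheaders = [('User-agent',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36')]

page = opener.open('http://livefootball.ws/')
print page.read()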

Related

Python Web Scraping LinkedIn Users - Incomplete result

Please, I need some support from you all. This is a practice Python script used for scraping employee information/user URLs from LinkedIn. The code can currently only print the user names and their current positions, and even that result is incomplete (some entries are just a name without the role at the company). In the end, the user URLs could not be printed out at all.
import random
import argparse
import requests
import re

parser = argparse.ArgumentParser(description='Searches Google For Linkedin Profiles')
parser.add_argument('--keyword', type=str, help='keywords to search')
parser.add_argument('--limit', type=int, help='how many profiles to scrape')
args = parser.parse_args()

class LinkedinScraper(object):
    def __init__(self, keyword, limit):
        #:param keyword: a str of keyword(s) to search for
        #:param limit: number of profiles to scrape
        self.keyword = keyword.replace(' ', '%20')
        self.all_htmls = ""
        self.server = 'www.google.com'
        self.quantity = '100'
        self.limit = int(limit)
        self.counter = 0

    def search(self):
        #perform the search
        #:return: a list of htmls from Google Searches
        # choose a random user agent
        user_agents = [
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/11.10 Chromium/18.0.1025.142 Chrome/18.0.1025.142 Safari/535.19',
            'Mozilla/5.0 (Windows NT 5.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00'
        ]
        while self.counter < self.limit:
            headers = {'User-Agent': random.choice(user_agents)}
            url = 'http://google.com/search?num=100&start=' + str(self.counter) + '&hl=en&meta=&q=site%3Alinkedin.com/in%20' + self.keyword
            resp = requests.get(url, headers=headers)
            if ("Our systems have detected unusual traffic from your computer network.") in resp.text:
                print("Running into captchas")
                return
            self.all_htmls += resp.text
            self.counter += 10

    def parse_links(self):
        reg_links = re.compile("url=https://www.linkedin.com(.*?)&")
        self.temp = reg_links.findall(self.all_htmls)
        results = []
        for regex in self.temp:
            final_url = regex.replace("url= ", "")
            results.append("https://www.linkedin.com" + final_url)
        return results

    def parse_people(self):
        # :param html: parse the html for Linkedin Profiles using regex
        # :return: a list of
        reg_people = re.compile(r'>[a-zA-Z0-9._ -]* -|\| LinkedIn')
        self.temp = reg_people.findall(self.all_htmls)
        print(self.temp)
        results = []
        for iteration in (self.temp):
            delete = iteration.replace(' | LinkedIn', '')
            delete = delete.replace(' - LinkedIn', '')
            delete = delete.replace(' profiles ', '')
            delete = delete.replace('LinkedIn', '')
            delete = delete.replace('|', '')
            delete = delete.replace('"', '')
            delete = delete.replace('>', '')
            delete = delete.strip("-")
            if delete != " ":
                results.append(delete)
        return results

if __name__ == "__main__":
    ls = LinkedinScraper(keyword="Tesla", limit=100)
    ls.search()
    links = ls.parse_links()
    print(links)
    profiles = ls.parse_people()
    print(*profiles, sep="\n")
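When debugging regex scraping like this, it can help to run the parse_people pattern against a small made-up sample, since what it captures depends entirely on the text of each result title. A hypothetical illustration (the titles below are invented, not real Google output):
import re

reg_people = re.compile(r'>[a-zA-Z0-9._ -]* -|\| LinkedIn')

# Invented result titles, purely for illustration.
sample = '<h3>Jane Doe - Tesla | LinkedIn</h3><h3>John Smith | LinkedIn</h3>'
print(reg_people.findall(sample))
# ['>Jane Doe -', '| LinkedIn', '| LinkedIn']
# The first title yields '>Jane Doe -': everything after the last " -" before
# the "|" is dropped. The second title contains no " -" at all, so only the
# generic '| LinkedIn' token is captured and that name is lost entirely.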

How would I use multithreading on an API request like this?

I created a username checker against the Ubisoft API. However, the requests are fairly slow, so I wanted to speed it up, and one thing I thought of was multithreading. I know about pools and such, but I've got no clue how to use them with an API request like this.
def check():
    global checkedCount
    global availableCount
    headers = {
        'Method': 'GET',
        'Authority': 'public-ubiservices.ubi.com',
        'referer': 'https://lb-prod-acc_ount-pdc.ubisoft.com',
        'Ubi-AppId': 'c5393f10-7ac7-4b4f-90fa-21f8f3451a04',
        'Authorization': authToken,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Ubi-RequestedPlatformType': 'uplay'}
    for name in usernameList:
        r = requests.get("https://public-ubiservices.ubi.com/v3/profiles?nameOnPlatform=" + name + "&platformType=uplay", headers=headers)
        while r.status_code != 200:  # retry on rate limit
            r = requests.get("https://public-ubiservices.ubi.com/v3/profiles?nameOnPlatform=" + name + "&platformType=uplay", headers=headers)
        if not r.json()['profiles']:
            availableCount += 1
            checkedCount += 1
            print(f"{Fore.CYAN}[$]{Fore.RESET} {name} is available")
        else:
            checkedCount += 1
            print(f"{Fore.CYAN}[$]{Fore.RESET} {name} is unavailable")
Please don't say it's a duplicate question, because I'm not trying to use multiple URLs like the other questions do.
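One standard-library way to run these checks concurrently is a thread pool. Below is a minimal sketch rather than a drop-in replacement: it assumes the headers dict, authToken and usernameList from your snippet already exist, moves the per-name work into its own function, and prints availability directly instead of updating the global counters:
from concurrent.futures import ThreadPoolExecutor
import requests

def check_name(name):
    # Same request and retry loop as in check(), but for a single name.
    url = ("https://public-ubiservices.ubi.com/v3/profiles"
           "?nameOnPlatform=" + name + "&platformType=uplay")
    r = requests.get(url, headers=headers)
    while r.status_code != 200:  # retry on rate limit
        r = requests.get(url, headers=headers)
    return name, not r.json()['profiles']  # True -> available

# 10 worker threads issue requests in parallel; map preserves the input order.
with ThreadPoolExecutor(max_workers=10) as pool:
    for name, available in pool.map(check_name, usernameList):
        print(f"{name} is {'available' if available else 'unavailable'}")
Note that running many requests in parallel will hit the rate limit sooner, so you may want to keep max_workers small and add a short sleep between retries.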

Python/JSON Code - Where were these variables located?

This code is from a separate submission.
If you look at these lines:
each['AuthorString']
each['Title']
I'm wondering where the user got these variables from.
I navigated to the JSON page (Link) and could not find these variables. Maybe I'm on the wrong page? Screenshots would help.
Here is the code:
import requests

session_ids = ['13619', '13736']
for session_id in session_ids:
    url = 'https://cdn-solr.asco.org/solr/ml/mlselect'
    payload = '?_format=json&wt=json&indent=true&q=SessionId:' + session_id + '&start=0&rows=30&sort=score%20desc,%20SessionId%20asc&fq=RecordType:sessions&facet=true&f.Year.facet.sort=index&facet.field={!key=Year}Year&facet.field={!key=subject_thes}subject_thes&facet.field={!key=MediaTypes}MediaTypes&facet.field={!key=fctSessionType}fctSessionType&facet.pivot={!key=MeetingName}fctMeetingName,fctTrack&spellcheck.maxCollationTries=100'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    jsonData = requests.get(url + payload, headers=headers).json()
    sessionParticipationID = jsonData['response']['docs'][0]['SessionParticipationID']
    session_id_list = '%20OR%20'.join(sessionParticipationID)
    payload = '?_format=json&wt=json&indent=true&sort=PresentationOrderWithinSession%20asc,%20ISODateString%20asc,%20ISODateStringEnd%20asc&fl=_id,%20score,%20ISODateString,%20ISODateStringEnd,%20ISODateString_1,%20ISODateStringEnd_1,%20Year,%20Title,%20tempAbstractID,%20MediaID,%20VideoID,%20EdBookID,%20edBookTitle,%20PosterID,%20edBookTitle,%20SessionTitle,%20SessionTypeId,%20AuthorString,%20AbstID,%20Role,%20FullName,%20PosterBoard,%20Institution,%20ProgramTitle,%20MeetingName,%20FirstAuthor&q=_id:(' + session_id_list + ')&rows=' + str(len(sessionParticipationID))
    jsonData = requests.get(url + payload, headers=headers).json()
    title_auth = []  # <-- to make a list of {title:author} dictionary
    for each in jsonData['response']['docs']:
        title = each['Title']  # this line
        author = each['AuthorString']  # and this
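For what it's worth, Title and AuthorString are not names the author invented; they are Solr field names that the second payload explicitly requests through its fl= parameter, so the documents in jsonData['response']['docs'] come back with those keys (Solr typically omits a field from a document that has no value for it). A quick way to see which fields a returned document actually contains, assuming the request above succeeded:
# Print the field names present in the first returned document.
first_doc = jsonData['response']['docs'][0]
print(sorted(first_doc.keys()))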

Scrape/Extract Skype IDs from Google

So basically, websites have their Skype id on their website in this format: Skype ID: USERNAMEWOULDBEHERE or Skype: USERNAMEWOULDBEHERE
I'm just trying to extract their usernames/Skype ID.
Am I doing anything wrong? How would I check for both strings? (Skype: & Skype ID:)
Help is much appreciated. I'm a beginner in Python so please go easy with me lol.
#!/usr/bin/env python2
# -*- coding: utf8 -*-

import sys
import time
import random
import argparse
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.common.exceptions import NoSuchFrameException
from selenium.webdriver.common.keys import Keys

# If this script no longer fetches any results check the XPath

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--search', help='Enter the search term')
    parser.add_argument('-p', '--pages', default='1', help='Enter how many pages to scrape (1 page = 100 results)')
    return parser.parse_args()

def start_browser():
    br = webdriver.Firefox()
    br.implicitly_wait(10)
    return br

def get_ua():
    ua_list = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0']
    ua = random.choice(ua_list)
    return ua

def scrape_results(br):
    links = br.find_elements_by_xpath("Skype ID: ")
    results = []
    for link in links:
        title = link.text.encode('utf8')
        url = link.get_attribute('href')
        title_url = (title, url)
        results.append(title_url)
    return results

def go_to_page(br, page_num, search_term):
    page_num = page_num - 1
    start_results = page_num * 100
    start_results = str(start_results)
    url = 'https://www.google.com/webhp?#num=100&start='+start_results+'&q='+search_term
    print '[*] Fetching 100 results from page '+str(page_num+1)+' at '+url
    br.get(url)
    time.sleep(2)

def main():
    args = parse_args()
    br = start_browser()
    if not args.search:
        sys.exit("[!] Enter a term or phrase to search with the -s option: -s 'dan mcinerney'")
    search_term = args.search
    pages = args.pages
    all_results = []
    for page_num in xrange(int(pages)):
        page_num = page_num+1  # since it starts at 0
        go_to_page(br, page_num, search_term)
        titles_urls = scrape_results(br)
        for title in titles_urls:
            all_results.append(title)
    for result in all_results:
        title = result[0]
        url = result[1]
        print '[+]', title, '--', url
    br.quit()

if __name__ == "__main__":
    main()
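On the specific question of matching both prefixes: find_elements_by_xpath("Skype ID: ") won't find anything, because "Skype ID: " is not an XPath expression. One simple alternative, sketched here under the assumption that you grab the page text first (for example via br.page_source from the webdriver returned by start_browser()), is a single regex that accepts either "Skype:" or "Skype ID:":
import re

# Matches "Skype: username" or "Skype ID: username" and captures the username.
skype_re = re.compile(r'Skype(?:\s*ID)?\s*:\s*(\S+)', re.IGNORECASE)

page_text = br.page_source  # assumes `br` is the webdriver from start_browser()
skype_ids = skype_re.findall(page_text)
print(skype_ids)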

Python 3: HTTP Error 405: Method Not Allowed

I'm getting an 'HTTP Error 405: Method Not Allowed' error. My code is:
import urllib.request
import urllib.parse

try:
    url = 'https://www.google.com/search'
    values = {'q': 'python programming tutorials'}
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')  # data should be bytes
    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
    req = urllib.request.Request(url, data, headers=headers)
    resp = urllib.request.urlopen(req)
    print("HERE")
    respData = resp.read()
    saveFile = open('withHeaders.txt', 'w')
    saveFile.write(str(respData))
    saveFile.close()
except Exception as e:
    print(e)
I guess the error is in req = urllib.request.Request(url, data, headers=headers). Is the error syntactical? What should be changed in the code? And please correct any conceptual mistake I've made.
EDIT
Concept:
def URLRequest(url, params, method="GET"):
    if method == "POST":
        return urllib2.Request(url, data=urllib.urlencode(params))
    else:
        return urllib2.Request(url + "?" + urllib.urlencode(params))
The 405 happens because passing data to urllib.request.Request turns the request into a POST, and that endpoint does not allow POST; send the query in the URL as a GET instead. You can use the Requests library for this; it's much cleaner than urllib:
import requests
q = 'Whatever you want to search'
url = 'https://www.google.com/search'
response = requests.get(url+'?'+'q='+q)
saveFile = open('response.txt', 'w')
saveFile.write(response.text)
saveFile.close()
Or if you want to stick to urllib, you can do this:
import urllib.request
url = 'https://www.google.com/search'
q = 'Search Query'
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
request = urllib.request.Request(url+'?'+'q='+q, headers=headers)
response = urllib.request.urlopen(request).read() # the text of the response is here
saveFile = open('withHeaders.txt', 'w')
saveFile.write(str(response))
saveFile.close()
Here is another example, in reference to www.pythonforbeginners:
# Importing the module
import urllib.request
# your search text
text="hi google"
# Define the url
url = 'http://www.google.com/#q='+text
# Add your headers
headers = {'User-Agent' : 'Mozilla 5.10'}
# Create the Request.
request = urllib.request.Request(url, None, headers)
# Getting the response
response = urllib.request.urlopen(request)
# Print the headers
print (response.read())
