Python | Selenium cannot detect button - python

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
import os
# Ask the user for the game PIN and the nickname to join with.
Game_Pin = input('Enter your PIN: ')
NickNAME = input('Enter your nickname: ')
# Start Chrome through the chromedriver binary at this local path.
driver = webdriver.Chrome(executable_path=r"C:\WebDriver\bin\chromedriver.exe")
def Enter_Press(driver):
    """Click the element whose text contains 'Enter'.

    Args:
        driver: Selenium WebDriver currently on the page that shows the
            button.
    """
    driver.find_element_by_xpath("//*[contains(text(), 'Enter')]").click()
def OK_GO(driver):
    """Click the element whose text contains 'OK, go!'.

    Args:
        driver: Selenium WebDriver currently on the nickname page.
    """
    # NOTE(review): if the button has not been rendered yet when this runs,
    # the lookup fails; an explicit WebDriverWait for a clickable element may
    # be needed here -- confirm against the live page.
    driver.find_element_by_xpath("//*[contains(text(), 'OK, go!')]").click()
def Kahoot_Spammer(Game_Pin, NickNAME, driver):
    """Join a Kahoot game: submit the PIN, then the nickname.

    Args:
        Game_Pin: Game PIN typed into the 'inputSession' field.
        NickNAME: Nickname typed into the 'username' field.
        driver: Selenium WebDriver used to drive the browser.
    """
    driver.get('https://kahoot.it/')
    driver.maximize_window()
    # The implicit wait applies to every later element lookup on this driver,
    # so it is set once rather than before each step.
    driver.implicitly_wait(2)
    # XPath attribute tests use '@'; "[#id=...]" is not a valid expression.
    pin_field = driver.find_element_by_xpath("//*[@id='inputSession']")
    pin_field.send_keys(Game_Pin)
    Enter_Press(driver)
    name_field = driver.find_element_by_xpath("//*[@id='username']")
    name_field.send_keys(NickNAME)
    OK_GO(driver)
# Run the join flow with the PIN, nickname and driver created earlier in the script.
Kahoot_Spammer(Game_Pin, NickNAME, driver)
This is the code. It's supposed to open a Chrome browser and navigate to the Kahoot.it website, then take the information you gave it and enter it for you. The first part, entering the game PIN, works, but once it gets to the nickname page it cannot detect the "OK, go!" button.
driver.find_element_by_xpath("//*[contains(text(), 'OK, go!')]").click()
I've inspected the button but cannot seem to find what to put within the code above. Any ideas?
Here is the source code.
<!doctype html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if IE 9]> <html class="no-js lt-ie10" lang="en"> <![endif]-->
<!--[if gt IE 9]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>Kahoot!</title>
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1.0, minimum-scale=1.0"/>
<meta name="viewport" content="initial-scale=1, maximum-scale=1.0, minimum-scale=1.0" media="(device-height: 568px)"/>
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-itunes-app" content="app-id=1131203560">
<meta name="description" content="Join a game of kahoot here. Kahoot! is a free game-based learning platform that makes it fun to learn – any subject, in any language, on any device, for all ages!">
<meta name="keywords" content="education, platform, smart phone, tablet, mobile, social, inclusive, HTML5, classroom, engagement, play, game, fun, quiz, multi-player, pedagogy, learning model, learn, gamification." />
<link rel="shortcut icon" href="/shared/theme/kahoot/img/icons/favicon.ico">
<link rel="apple-touch-icon-precomposed" sizes="144x144" href="/shared/theme/kahoot/img/icons/touch_icon_144.png">
<link rel="apple-touch-icon-precomposed" sizes="114x114" href="/shared/theme/kahoot/img/icons/touch_icon_114.png">
<link rel="apple-touch-icon-precomposed" sizes="72x72" href="/shared/theme/kahoot/img/icons/touch_icon_72.png">
<link rel="apple-touch-icon-precomposed" href="/shared/theme/kahoot/img/icons/touch_icon_57.png">
<link rel="stylesheet" type="text/css" href="/shared/css/cloak.css">
<div style="height: 0; width: 0; position: absolute; visibility: hidden">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><defs><filter x="-2.2%" y="-2.3%" width="104.4%" height="104.8%" filterUnits="objectBoundingBox" id="a"><feOffset dy="1" in="SourceAlpha" result="shadowOffsetOuter1"/><symbol id="logo-shapes" viewBox="0 0 24 24"><ellipse cx="5.506" cy="18.966" rx="4.953" ry="4.953"/><path d="M12.005 5.902L17.873.033l5.869 5.869-5.869 5.868zm1.443 8.899h8.849v8.849h-8.849zm-2.584-4.977H.146l5.36-8.555z"/></symbol></svg>
</div>
<script src="https://tap-nexus.appspot.com/js/sdk/kahunaAPI_min.js"></script>
<script type="text/javascript">
(function(e,t){var n=e.amplitude||{_q:[],_iq:{}};var r=t.createElement("script");r.type="text/javascript";
r.async=true;r.src="https://d24n15hnbwhuhn.cloudfront.net/libs/amplitude-3.4.0-min.gz.js";
r.onload=function(){e.amplitude.runQueuedFunctions()};var i=t.getElementsByTagName("script")[0];
i.parentNode.insertBefore(r,i);function s(e,t){e.prototype[t]=function(){this._q.push([t].concat(Array.prototype.slice.call(arguments,0)));
return this}}var o=function(){this._q=[];return this};var a=["add","append","clearAll","prepend","set","setOnce","unset"];
for(var u=0;u<a.length;u++){s(o,a[u])}n.Identify=o;var c=function(){this._q=[];return this;
};var p=["setProductId","setQuantity","setPrice","setRevenueType","setEventProperties"];
for(var l=0;l<p.length;l++){s(c,p[l])}n.Revenue=c;var d=["init","logEvent","logRevenue","setUserId","setUserProperties","setOptOut","setVersionName","setDomain","setDeviceId","setGlobalUserProperties","identify","clearUserProperties","setGroup","logRevenueV2","regenerateDeviceId","logEventWithTimestamp","logEventWithGroups"];
function v(e){function t(t){e[t]=function(){e._q.push([t].concat(Array.prototype.slice.call(arguments,0)));
}}for(var n=0;n<d.length;n++){t(d[n])}}v(n);n.getInstance=function(e){e=(!e||e.length===0?"$default_instance":e).toLowerCase();
if(!n._iq.hasOwnProperty(e)){n._iq[e]={_q:[]};v(n._iq[e])}return n._iq[e]};e.amplitude=n;
})(window,document);
</script>
<base href="/">
<script type="text/javascript">
document.write('<scri'+'pt ');
document.write('type="text/javascript" ');
document.write('src="'+'/shared/theme/config.js');
document.write("?"+new Date().getTime()+'">');
document.write('</scri'+'pt>');
</script>
</head>
<body snitch ios7-viewport-fix>
<noscript>
<h1>Kahoot! needs JavaScript to work</h1>
<p>
To use Kahoot!, you need to have JavaScript enabled in your browser. To enable JavaScript, please do the following:
</p>
<ul>
<li>Follow these instructions.</li>
<li>Make sure you have the latest browser.</li>
<li>Turn off or disable the NoScript extension, if you have it.</li>
<li>Contact your IT administrator to allow access to Kahoot! in your security preferences.</li>
</ul>
<p>If you continue to have problems, please let us know by contacting Kahoot! support.</p>
</noscript>
<div id="debug-info" debug-info="dev,test" debug-timestamp></div>
<dev-mode></dev-mode>
<div class="loader" loader></div>
<iframe
id="gameBlockIframe"
style="display:none;"
class="game-block-iframe"
sandbox="allow-scripts allow-same-origin"
scrolling="no">
</iframe>
<div id="mainView" ng-cloak ng-view>
<h1>Join in a Kahoot! here</h1>
<p>To learn more about Kahoot! visit kahoot.com</p>
</div>
<div ng-cloak alerts></div>
<script type="text/javascript" src="/js/bootstrap.js"></script>
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
function gup( name, url ) {
if (!url) url = location.href;
name = name.replace(/[\[]/,"\\\[").replace(/[\]]/,"\\\]");
var regexS = "[\\?&]"+name+"=([^&#]*)";
var regex = new RegExp( regexS );
var results = regex.exec( url );
return results == null ? null : results[1];
}
var clientId = gup('gaId', window.location.search);
if (clientId) {
ga('create', 'UA-35308575-1', 'auto', {'allowLinker': true, 'clientId':gup('gaId', window.location.search)});
ga('create', 'UA-35308575-4', 'auto', {'name': 'legacy', 'clientId':gup('gaId', window.location.search)});
var platform = gup('platform', window.location.search);
if (typeof platform === 'string' && platform == 'iOS') {
window.ga('set', 'appName', 'Kahoot');
window.ga('set', 'appId', 'no.mobitroll.kahoot.controller');
}
if (typeof platform === 'string' && platform == 'Android') {
window.ga('set', 'appName', 'Kahoot');
window.ga('set', 'appId', 'no.mobitroll.kahoot.android');
}
} else {
ga('create', 'UA-35308575-1', 'auto', {'allowLinker': true});
ga('create', 'UA-35308575-4', 'auto', {'name': 'legacy'});
}
ga('send', 'pageview');
ga('legacy.send', 'pageview');
</script>
</body>
</html>

implicitly_wait only needs to be set once, when the driver is initialized. To wait explicitly for a fixed number of seconds you can use time.sleep(), although the more practical solution is to wait dynamically for the element to be present/clickable using Selenium's WebDriverWait.

Related

Not able to log in to the WhaleWisdom website using BeautifulSoup

I have been trying to log in to the WhaleWisdom website for the last two weeks, but I'm not able to. I have tried many libraries, such as scrapy, selenium, beautifulsoup, etc.
from requests import Session
from bs4 import BeautifulSoup as bs
# Use one session for every request so that cookies set during login are
# sent again with the dashboard request.
with Session() as s:
    # Fetch the login form first: it carries the authenticity token that has
    # to be posted back together with the credentials.
    login_page = s.get("https://whalewisdom.com/login")
    bs_content = bs(login_page.content, "lxml")
    authenticity_token = bs_content.find("input", {"name": "authenticity_token"})["value"]
    login_data = {
        "authenticity_token": authenticity_token,
        "login": "info@example.com",
        "password": "***********",
        # NOTE(review): requests URL-encodes form values itself, so the "+"
        # is sent as a literal plus sign; "Log In" may be the intended value
        # -- confirm against the real form.
        "commit": "Log+In",
    }
    s.post("https://whalewisdom.com/session", data=login_data)
    html_data = bs(s.get("https://whalewisdom.com/dashboard").content, "html.parser")
    print(html_data)
enter image description here
Here is the output [enter image description here]:
<!DOCTYPE html>
<html lang="en">
<head>[enter image description here][1]
<meta charset="utf-8"/>
<title>WhaleWisdom Dashboard</title>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1.0" name="viewport"/>
<meta content="WhaleWisdom tracks 13F, Schedule 13D, and 13G EDGAR filings by hedge funds. Hedge Fund Whale Backtesting and search tools" name="description"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/images/apple-touch-icon-76x76.png" rel="apple-touch-icon" sizes="76x76"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/images/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/images/favicon-96x96.png" rel="icon" sizes="96x96" type="image/png"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/images/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<meta content="r4hQnHlN2H-GtcIb06YHl49VSipApmfQQWIOvZzfnAU" name="google-site-verification">
<link href="https://fonts.googleapis.com/css?family=Roboto:100,300,400,500,700|Material+Icons" rel="stylesheet" type="text/css"/>
<link href="https://cdn.jsdelivr.net/npm/font-awesome@4.7.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/packs/css/whalewisdom-24fbc382.css" media="screen" rel="stylesheet">
<meta content="authenticity_token" name="csrf-param">
<meta content="XMAu/LK+dKi/zt/XSTvxIJ8jKl2x8Rx47/ZnAiN6MQCcZmSSlUrOLMeURRr54eCfEWHY8oyS8c6GYxLoIMomNQ==" name="csrf-token">
</meta></meta></link></meta></head>
<body>
<noscript>
<strong>We're sorry but the WhaleWisdom Dashboard doesn't work properly without JavaScript enabled. Please enable it to continue.</strong>
</noscript>
<div id="app"></div>
<script src="https://d27mjrcvcy56qq.cloudfront.net/packs/js/whalewisdom-4b32da19479fdebf5332.js"></script>
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-11651599-1', 'auto');
ga('send', 'pageview');
</script>
<script async="" charset="utf-8" src="//ads.investingchannel.com/adtags/WhaleWisdom/quotepages/970x91.js" type="text/javascript"></script>
</body>
</html>
You can see that in the HTML output, at line 23, there is an error type message stating that the WhaleWisdom dashboard doesn't work properly without JavaScript.
<!DOCTYPE html>
<html lang="en">
<head>[enter image description here][1]
<meta charset="utf-8"/>
<title>WhaleWisdom Dashboard</title>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1.0" name="viewport"/>
<meta content="WhaleWisdom tracks 13F, Schedule 13D, and 13G EDGAR filings by hedge funds. Hedge Fund Whale Backtesting and search tools" name="description"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/images/apple-touch-icon-76x76.png" rel="apple-touch-icon" sizes="76x76"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/images/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/images/favicon-96x96.png" rel="icon" sizes="96x96" type="image/png"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/images/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<meta content="r4hQnHlN2H-GtcIb06YHl49VSipApmfQQWIOvZzfnAU" name="google-site-verification">
<link href="https://fonts.googleapis.com/css?family=Roboto:100,300,400,500,700|Material+Icons" rel="stylesheet" type="text/css"/>
<link href="https://cdn.jsdelivr.net/npm/font-awesome@4.7.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="https://d27mjrcvcy56qq.cloudfront.net/packs/css/whalewisdom-24fbc382.css" media="screen" rel="stylesheet">
<meta content="authenticity_token" name="csrf-param">
<meta content="XMAu/LK+dKi/zt/XSTvxIJ8jKl2x8Rx47/ZnAiN6MQCcZmSSlUrOLMeURRr54eCfEWHY8oyS8c6GYxLoIMomNQ==" name="csrf-token">
</meta></meta></link></meta></head>
<body>
----
<noscript>
<strong>We're sorry but the WhaleWisdom Dashboard doesn't work properly without JavaScript enabled. Please enable it to continue.</strong>**
</noscript>
----
<div id="app"></div>
<script src="https://d27mjrcvcy56qq.cloudfront.net/packs/js/whalewisdom-4b32da19479fdebf5332.js"></script>
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-11651599-1', 'auto');
ga('send', 'pageview');
</script>
<script async="" charset="utf-8" src="//ads.investingchannel.com/adtags/WhaleWisdom/quotepages/970x91.js" type="text/javascript"></script>
</body>
</html>
I think because of this it is not working. I also can't test it right now because I don't use WhaleWisdom.

Python Requests Get not pulling webpage

I have a list of UPC codes and I am trying to write a script to pull information about them from https://www.barcodelookup.com, but the request returns only the HTML tags and none of the information I want.
Here is a sample of my code:
import requests
from bs4 import BeautifulSoup
# Download the product page and keep both a prettified and a raw copy on disk.
page = requests.get('https://www.barcodelookup.com/075610166101')
soup = BeautifulSoup(page.text, 'html.parser')
bsoup = soup.prettify()
# Write with an explicit encoding so non-ASCII characters in the page do not
# depend on the platform's default codec.
with open('output1.html', 'w', encoding='utf-8') as file:
    file.write(bsoup)
with open('output.html', 'w', encoding='utf-8') as file:
    file.write(page.text)
Sample output.html:
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]> <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]> <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-US">
<!--<![endif]-->
<head>
<title>
Attention Required! | Cloudflare
</title>
<meta id="captcha-bypass" name="captcha-bypass"/>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=Edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="noindex, nofollow" name="robots"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="/cdn-cgi/styles/cf.errors.css" id="cf_styles-css" media="screen,projection" rel="stylesheet" type="text/css"/>
<!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" type="text/css" media="screen,projection" /><![endif]-->
<style type="text/css">
body{margin:0;padding:0}
</style>
<!--[if gte IE 10]><!-->
<script src="/cdn-cgi/scripts/zepto.min.js" type="text/javascript">
</script>
<!--<![endif]-->
<!--[if gte IE 10]><!-->
<script src="/cdn-cgi/scripts/cf.common.js" type="text/javascript">
</script>
<!--<![endif]-->
<style type="text/css">
#cf-wrapper #spinner {width:69px; margin: auto;}
#cf-wrapper #cf-please-wait{text-align:center}
.attribution {margin-top: 32px;}
.bubbles { background-color: #f58220; width:20px; height: 20px; margin:2px; border-radius:100%; display:inline-block; }
#cf-wrapper #challenge-form { padding-top:25px; padding-bottom:25px; }
#cf-hcaptcha-container { text-align:center;}
</style>
</head>
<body>
<div id="cf-wrapper">
<div class="cf-alert cf-alert-error cf-cookie-error" data-translate="enable_cookies" id="cookie-alert">
Please enable cookies.
</div>
<div class="cf-error-details-wrapper" id="cf-error-details">
<div class="cf-wrapper cf-header cf-error-overview">
<h1 data-translate="challenge_headline">
One more step
</h1>
<h2 class="cf-subheadline">
<span data-translate="complete_sec_check">
Please complete the security check to access
</span>
www.barcodelookup.com
</h2>
</div>
Sample output1.html:
<div class="cf-section cf-highlight cf-captcha-container">
<div class="cf-wrapper">
<div class="cf-columns two">
<div class="cf-column">
<div class="cf-highlight-inverse cf-form-stacked">
<form action="/075610166101?__cf_chl_captcha_tk__=10080e641441171d59b24657ed37a7381be4a368-1595778921-0-AS91JaY_1ozqjwuL0cLJj39tDQ8tO-5t6vMnZ4LFD6V9L_k_jFw1qb6NW_KOPGyf53pazgUHKpjsBF0oCu3pWy-n1rks1eGTzPNdPJvDUgly5EfmCU2hfkPgF0u9Mmb0jAt0uNra1wy-xDgG87ZgWd3KvYSj1Jre0DtwvkXITbLAaAdSg5UeBhw4DDEuCxFILAwhLTU3YHEm9F1CbC7cqA-U05kTDiOIBnZngHGBrnOWB9LYl6asezmwfpuzNZTovixMVE8BBKVfIf1gJjllYh7626I1abfYw38uuoIy0viPuN_CtjB8JoBbs2qrix4gXW6PGu9EA5ZPhBw-IQ8csPLN-a0WFRqB3Il-Hz6M6z9Wdb-OHUKOjX37n_fBuQarqU34cgbG4CNpD_7cdn_NUrlJ6xsRZiFV13V2q4zBS4XpPwabA_unBIjziYgIiB-y9hwndtV08bMXxtoSqtNxxev3fNnL_cQ" class="challenge-form" enctype="application/x-www-form-urlencoded" id="challenge-form" method="POST">
<input name="r" type="hidden" value="33260f1c9e17bb57e0d89a1d21e050da58f9c0a0-1595778921-0-Ad2sk2X3qN2WwWLekQkZpeJCOg0H0bI9CHDtAranzrOjQHfchnqyW9dHD3S6CpbKRRrV/9pFNY+jLG7XUks78zi0PsNBHSNwDV4ad2liittfYU5X73GgFmyN3COYAQomUPoPxw+YPyMTRPrR0P6qFUh92fhmLMbivztY8iwFFTppCHO1Kx8Ax+4orJWgb31sJpRrtuasqpgFs9qCAhBgBKzue/BginjozYpNbGDlrdjnWnh+b+SxL+HWxzkFLwxoIWDJ6dMHaZSp/zvBptO5cgBTpPupAYNvcB2O3YGapY0UefpxmhXntG50yXyrQmobqrh4rjuyXgDup3HO8ETKUwnZ37f4NN0LuYA2k9nveVh0j9hqy/P09wbQE8AChLs2/u2uqpTcGyPSpbTOyNo1FjfD+BpE6KqQsL8l9hOtHuHviayTngoqOrOMW6"/>
<input name="cf_captcha_kind" type="hidden" value="h"/>
<input name="vc" type="hidden" value=""/>
<script async="" data-ray="5b8f4df52fe3741d"
I am trying to post both output files to show the returned information, but the system won't let me.
Websites usually put some security mechanisms in place to avoid being scraped. The most basic check is serving content based on the User-Agent header: if a requesting client does not send any user-agent information, it is treated as an unsupported browser or as a bot/script. So simply adding a User-Agent header (mimicking Google Chrome) allows us to get content from this site.
Here is your updated script:
import requests
from bs4 import BeautifulSoup
# Present a browser-like User-Agent; without it the site answers with a
# challenge page instead of the product data.
headers = {
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
page = requests.get('https://www.barcodelookup.com/075610166101', headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
bsoup = soup.prettify()
# Write with an explicit encoding so non-ASCII characters in the page do not
# depend on the platform's default codec.
with open('output1.html', 'w', encoding='utf-8') as file:
    file.write(bsoup)
with open('output.html', 'w', encoding='utf-8') as file:
    file.write(page.text)

Python beautifulsoup print does not print whole html page

I am scraping a website that uses pagination. I was testing the loop and printing the BeautifulSoup output inside it. When the results are printed, I noticed that the result is not the complete HTML text; it only includes the first part of the HTML. Here is my code:
from bs4 import BeautifulSoup
import requests
import time
total_pages = 2295
# range() excludes its stop value, so add 1 to include the last page.
for i in range(1, total_pages + 1):
    pageNumber = str(i)
    # Fetch the listing page for this page number as text.
    html = requests.get("https://www.propertyguru.com.sg/property-for-sale/"+pageNumber+"?order=desc&property_type=N&property_type_code%5B0%5D=CONDO&property_type_code%5B1%5D=APT&property_type_code%5B2%5D=WALK&property_type_code%5B3%5D=CLUS&property_type_code%5B4%5D=EXCON&sort=date").text
    soup = BeautifulSoup(html, 'html.parser')
    print(soup.prettify())
When I print soup.prettify(), the result is this:
<!DOCTYPE doctype html>
<!--[if gt IE 9]><!-->
<html class="no-js is-new-brand" lang="en">
<!--<![endif]-->
<head>
<title>
</title>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="app-id=482524585" name="apple-itunes-app">
<meta content="app-id=com.allproperty.android.consumer.sg" name="google-play-app">
<meta content="9iVXbwdOPHOH_byBFBScAHm5x-kvcPzBS_fJBFPBwbo" name="google-site-verification">
<meta content="46acd457be6effa0" name="y_key"/>
<meta content="893837EF69C47405FBAFAB120889A598" name="msvalidate.01"/>
<link href="/images/is-new-brand-favicon.ico" rel="SHORTCUT ICON"/>
<link href="/search.xml" rel="search" title="PropertyGuru Search" type="application/opensearchdescription+xml"/>
<link href="https://cdn.pgimgs.com/1574318624/sf2-search/bundles/guruweblayout/img/is-new-brand-touch-logo.png" rel="apple-touch-icon"/>
<link href="https://cdn.pgimgs.com/1574318624/sf2-search/bundles/guruweblayout/img/is-new-brand-touch-logo.png" rel="android-touch-icon"/>
<script>
// check for browsers without complete flex support ( < IE 10)
window.onload = function(e){
if(Function('/*@cc_on return document.documentMode<=10@*/')()) {
window.location = '/ie-notsupported';
}
};
</script>
<link href="//cdn1.pgimgs.com/1574318624/sg-static/cssprod/propertyguru/layout.css" rel="stylesheet" type="text/css"/>
<link href="//cdn1.pgimgs.com/1574318624/sg-static/cssprod/propertyguru/sg.css" rel="stylesheet" type="text/css"/>
<link href="//cdn1.pgimgs.com/1574318624/sg-static/cssprod/propertyguru/new_styles.css" rel="stylesheet" type="text/css"/>
<script src="//cdn1.pgimgs.com/1574318624/sg-static/jsprod/lib/modernizr-custom.min.js" type="text/javascript">
</script>
<script src="//cdn1.pgimgs.com/1574318624/sg-static/jsprod/jquery-1.12.3.min.js" type="text/javascript">
</script>
<script type="text/javascript">
var guruApp = {"environment":null,"widgetSearch":null,"widgetPoll":null,"widgetGoogleAnalytics":{"dimensions":{"dimension3":"Production","dimension4":"en","dimension13":"SG","dimension14":"web"},"googleAnalyticsObject":null,"config":{"trackingId":"UA-2417512-2","cookieDomain":"propertyguru.com.sg","siteSpeedSampleRate":10}},"userSession":{"user":{"id":null,"username":null,"roles":null,"shortlist":0,"beta":false}},"isResponsive":"false","identityEndpoint":"https:\/\/identity.propertyguru.com\/identity","defaultCurrency":"SGD","googleMaps":{"key":"AIzaSyBlCo7kpcBszvIZoH709avg1rmUjjiop0k"},"googleApis":{"key":"367223124563-is5hdjeal1rr7og4i8ii7t8imihr1dg1.apps.googleusercontent.com"}};
</script>
<link href="https://fonts.googleapis.com/css?family=Roboto:400,500" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Nunito:600" rel="stylesheet" type="text/css"/>
<!--[if gt IE 8]><!-->
<link href="https://cdn.pgimgs.com/1574318624/sf2-search/css/legacy_css.css" rel="stylesheet" type="text/css">
<link href="//cdn1.pgimgs.com/1574318624/sg-static/cssprod/rich/fixes.css" rel="stylesheet" type="text/css">
<!--<![endif]-->
<script type="text/javascript">
<!--
var GMAP_KEY = "AIzaSyCUbmYAT3lyhBvao9Yg-WsKtRbMxO-VvVQ";
var REGION = "SG";
var images = [];
var freetextUrl = '//api.propertyguru.com/v1/autocomplete?limit=10&locale=en&format=csv_legacy&region=sg&objectType=HDB_ESTATE,DISTRICT,PROPERTY,STREET,MRT_STATION,SCHOOL';
//-->
</script>
<!-- GOOGLE AD MANAGER -->
<div class="clearboth">
</div>
<!-- Begin comScore Tag -->
<script>
var _comscore = _comscore || [];
_comscore.push({ c1: "2", c2: "13151479" });
(function() {
var s = document.createElement("script"), el = document.getElementsByTagName("script")[0]; s.async = true;
s.src = (document.location.protocol == "https:" ? "https://sb" : "http://b") + ".scorecardresearch.com/beacon.js";
el.parentNode.insertBefore(s, el);
})();
</script>
<noscript>
<img src="https://sb.scorecardresearch.com/p?c1=2&c2=13151479&cv=2.0&cj=1"/>
</noscript>
<!-- End comScore Tag -->
<!-- GOOGLE ANALYTICS CODE -->
<script src="https://cdn.pgimgs.com/1574318624/sf2-search/bundles/guruweblayout/js/desktop/logger.js" type="text/javascript">
</script>
<script src="https://cdn.pgimgs.com/1574318624/sf2-search/bundles/guruweblayout/js/fingerprint2.min.js" type="text/javascript">
</script>
<script src="https://cdn.pgimgs.com/1574318624/sf2-search/bundles/guruwidget/js/desktop/jquery.widgetGoogleAnalytics.js" type="text/javascript">
</script>
<!-- Google Analytics -->
<script type="text/javascript">
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
</script>
<script type="text/javascript">
if (typeof guruApp != 'undefined' && typeof guruApp.widgetGoogleAnalytics != 'undefined' && guruApp.widgetGoogleAnalytics.googleAnalyticsObject != null) {
guruApp.widgetGoogleAnalytics.googleAnalyticsObject.init();
}
</script>
<script src="https://cdn.pgimgs.com/1574318624/sf2-search/bundles/guruweblayout/js/desktop/jquery.eventDispatcher.js" type="text/javascript">
</script>
<script type="text/javascript">
$(document).ready(function () {
var $body = $('body'),
track = function(category, action, label, value, noninteraction, dimensions) {
label = cleanText(label);
guruApp.widgetGoogleAnalytics.googleAnalyticsObject.trackEvent(category, action, label, value, noninteraction, dimensions);
},
cleanText = function(str) {
return str.replace(/^https?:\/\/[^\/]+/, '').replace(/^\s+/, '').replace(/\s+$/, '').replace(/\s+/, ' ');
};
$body.find('.dropdown .dropdown-menu li.mainnav-areainsider').click(function () {
$body.trigger('ga.mainnav.areainsider.click');
});
});
</script>
<!-- ELOQUA TRACKING CODE -->
<script type="text/javascript">
var _elqQ = _elqQ || [];
_elqQ.push(['elqSetSiteId', '659351510']);
_elqQ.push(['elqTrackPageView']);
(function () {
function async_load() {
var s = document.createElement('script'); s.type = 'text/javascript'; s.async = true;
s.src = '//img03.en25.com/i/elqCfg.min.js';
var x = document.getElementsByTagName('script')[0]; x.parentNode.insertBefore(s, x);
}
if (window.addEventListener) window.addEventListener('DOMContentLoaded', async_load, false);
else if (window.attachEvent) window.attachEvent('onload', async_load);
})();
</script>
<script defer="" src="/pg186791.js" type="text/javascript">
</script>
<style type="text/css">
#d__fFH{position:absolute;top:-5000px;left:-5000px}#d__fF{font-family:serif;font-size:200px;visibility:hidden}#weeawqsxdstyxxvz{display:none!important}
</style>
</link>
</link>
</meta>
</meta>
</meta>
</head>
<body class="web_filter_recaptcha SG-web_filter_recaptcha layout-web lang-en app-sg legacy is-new-brand" id="web_filter_recaptcha">
<div id="wrapper-outer">
<div id="wrapper">
<div id="wrapper-inner">
<div class="alert alert-warning" id="gdpr-alert" role="alert" style="margin-bottom: 0; display:none;">
To comply with GDPR we will not store any personally identifiable information from you. Therefore we will serve sub-optimal experience where some features such as Login/Signup are disabled. However, you will be able to search and see all the properties, see agent contact details and contact them offline on your own.
</div>
<header class="navbar navbar-default" id="navbar-main">
<div class="header-bg">
<div class="container">
<nav class="header-nav clearfix" role="navigation">
<div class="navbar-header">
<button class="navbar-toggle" type="button">
<span class="sr-only">
Toggle navigation
</span>
<i class="pgicon pg
<!DOCTYPE doctype html>
<!--[if gt IE 9]><!-->
<html class="no-js is-new-brand" lang="en">
<!--<![endif]-->
<head>.....AND SO ON AND SO FORTH
It only prints some contents but not the whole html contents.
You are using the requests library, which does not execute JavaScript. This website uses an API, called from JavaScript, to populate the data.
You should try using Selenium. Selenium will load the whole page, including the JavaScript-rendered content. Then read page_source and parse it with BeautifulSoup.
The BeautifulSoup library only sees the view-source of a web page.
Ex:(view-source:https://www.propertyguru.com.sg/property-for-sale/1?order=desc&property_type=N&property_type_code%5B0%5D=CONDO&property_type_code%5B1%5D=APT&property_type_code%5B2%5D=WALK&property_type_code%5B3%5D=CLUS&property_type_code%5B4%5D=EXCON&sort=date)
Beautiful-soup library is working fine..
from bs4 import BeautifulSoup
import requests
import time
total_pages = 2295
# range() excludes its stop value, so add 1 to include the last page.
for i in range(1, total_pages + 1):
    pageNumber = str(i)
    # Fetch the listing page for this page number as text.
    html = requests.get("https://www.propertyguru.com.sg/property-for-sale/"+pageNumber+"?order=desc&property_type=N&property_type_code%5B0%5D=CONDO&property_type_code%5B1%5D=APT&property_type_code%5B2%5D=WALK&property_type_code%5B3%5D=CLUS&property_type_code%5B4%5D=EXCON&sort=date").text
    soup = BeautifulSoup(html, 'html.parser')
    print(soup.prettify())

Web Scraping - sessionStorage distil_referrer

I am trying to find a good way to get into a website, but I have found a big issue.
What is returning is the header of the html file, where there are those lines of code:
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml"><head>
<meta name="ROBOTS" content="NOINDEX, NOFOLLOW" />
<meta http-equiv="cache-control" content="max-age=0" />
<meta http-equiv="cache-control" content="no-cache" />
<meta http-equiv="expires" content="0" />
<meta http-equiv="expires" content="Tue, 01 Jan 1980 1:00:00 GMT" />
<meta http-equiv="pragma" content="no-cache" />
<meta http-equiv="refresh" content="10; url=/distil_r_blocked.html? Ref=/de/company/TI&distil_RID=F180AC0A-ECA2-11E6-80AC- E678F530D985&distil_TID=20170206193206" />
<script type="text/javascript">
(function(window){
try {
if (typeof sessionStorage !== 'undefined'){
sessionStorage.setItem('distil_referrer', document.referrer);
}
} catch (e){}
})(window);
</script>
<script type="text/javascript" src="/ga625684.js" defer="defer"></script><style type="text/css">#d__fFH{position:absolute;top:-5000px;left:-5000px}#d__fF{font-family:serif;font-size:200px;visibility:hidden}#svevtdraftsrybvfradvq{display:none!important}</style></head>
<body>
<div id="distil_ident_block"> </div>
<div id="d__fFH" style="position: absolute; top: -5000px; left: -5000px;"><object id="d_dlg" classid="clsid:3050f819-98b5-11cf-bb82-00aa00bdce0b" width="0px" height="0px"></object><span id="d__fF" style="font-family: ZWAdobeF,serif; font-size: 72px; visibility: hidden;">mmmmmmmmlli</span></div></body></html>
</script>
Is there a workaround to actually get the webpage?
For now I am trying to use Selenium, without success.
The code is the following:
from selenium import webdriver
# Open the company page in Firefox and dump the HTML the browser ended up with.
fp = webdriver.FirefoxProfile()
driver = webdriver.Firefox(firefox_profile=fp)
driver.get("https://www.moneyhouse.ch/de/company/TI")
# Call form of print works on both Python 2 and Python 3.
print(driver.page_source)

Browserless web scraping of Vaadin based website in Python

I'm new with web scraping and I encountered a problem.
I tried to extract the list of the states from this site, 'https://www.iso.org/obp/ui/#iso:code:3166:JP', by using Python, selenium and PhantomJS but I failed with the output as below.
<!DOCTYPE html><html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=11;chrome=1">
<style type="text/css">html, body {height:100%;margin:0;}</style>
<link rel="shortcut icon" type="image/vnd.microsoft.icon" href="./../VAADIN/themes/obp/favicon.ico">
<link rel="icon" type="image/vnd.microsoft.icon" href="./../VAADIN/themes/obp/favicon.ico">
<link rel="stylesheet" type="text/css" href="./../VAADIN/themes/obp/styles.css"><script type="text/javascript" src="./../VAADIN/widgetsets/org.iso.obp.ui.widgetset.applicationWidgetset/org.iso.obp.ui.widgetset.applicationWidgetset.nocache.js?1444641834593"></script><script src="https://www.iso.org/obp/VAADIN/widgetsets/org.iso.obp.ui.widgetset.applicationWidgetset/913365F3A38F531CF0D09D8744F3A155.cache.js"></script></head>
<body scroll="auto" class=" v-generated-body">
<div id="obpui-105541713" class=" v-app obp">
<div class=" v-app-loading"></div>
<noscript>
You have to enable javascript in your browser to use an application built with Vaadin.
</noscript>
</div>
<script type="text/javascript" src="./../VAADIN/vaadinBootstrap.js"></script>
<script type="text/javascript">//<![CDATA[
if (!window.vaadin) alert("Failed to load the bootstrap javascript: ./../VAADIN/vaadinBootstrap.js");
vaadin.initApplication("obpui-105541713",{"heartbeatInterval":300,"versionInfo":{"vaadinVersion":"7.3.10"},"vaadinDir":"./../VAADIN/","authErrMsg":{"message":"Take note of any unsaved data, and <u>click here<\/u> or press ESC to continue.","caption":"Authentication problem"},"widgetset":"org.iso.obp.ui.widgetset.applicationWidgetset","theme":"obp","comErrMsg":{"message":"Take note of any unsaved data, and <u>click here<\/u> or press ESC to continue.","caption":"Communication problem"},"serviceUrl":".","standalone":true,"sessExpMsg":{"message":"Take note of any unsaved data, and <u>click here<\/u> or press ESC key to continue.","caption":"Session Expired"}});
//]]></script>
</body></html>
My code in Python is here.
from selenium import webdriver
# Load the ISO country-code page in PhantomJS and dump the resulting HTML.
target_url = 'https://www.iso.org/obp/ui/#iso:code:3166:JP'
driver = webdriver.PhantomJS()
driver.get(target_url)
# Call form of print works on both Python 2 and Python 3.
print(driver.page_source)
Is there any solution for this?

Categories

Resources