Python script to extract a JavaScript variable from HTML on a page - python

I have the following javascript in the header of pages on my site:
<script type='text/javascript'>
var gaProperty = 'UA-00000000-1';
var disableStr = 'ga-disable-' + gaProperty;
if ( document.cookie.indexOf( disableStr + '=true' ) > -1 ) {
window[disableStr] = true;
}
function gaOptout() {
document.cookie = disableStr + '=true; expires=Thu, 31 Dec 2099 23:59:59 UTC; path=/';
window[disableStr] = true;
}
</script>
I'm trying to extract the var gaProperty value from each page (i.e. UA-00000000-1) in a list of URLs in a CSV file using Python. I'm new to Python and put together a script from bits of scripts I've seen around, but it doesn't work:
from requests_html import HTMLSession
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import re
list = []
with open('list.csv','r') as csvf: # Open file in read mode
    urls = csv.reader(csvf)
    for url in urls:
        list.append(url) # Add each url to list contents
for url in list:
    page = urlopen(url[0]).read()
    path = " ".join(url)
    soup = BeautifulSoup(page, "lxml")
    data = soup.find_all('script', type='text/javascript')
    gaid = re.search(r'UA-[0-9]+-[0-9]+', data[0].text)
    print(path, gaid)
The incorrect result I get is:
https://www.example.com/contact-us/ None
I need to achieve this desired output for each URL:
https://www.example.com/contact-us/ UA-00000000-1
Any idea how to get this working in Python?

I would include var gaProperty in the pattern to be more specific, then have the capture group lazily match everything between the quotes that follow, i.e. wrap the gaid value.
import re
html ='''
<script type='text/javascript'>
var gaProperty = 'UA-00000000-1';
var disableStr = 'ga-disable-' + gaProperty;
if ( document.cookie.indexOf( disableStr + '=true' ) > -1 ) {
window[disableStr] = true;
}
function gaOptout() {
document.cookie = disableStr + '=true; expires=Thu, 31 Dec 2099 23:59:59 UTC; path=/';
window[disableStr] = true;
}
</script>'''
gaid = re.search(r"var gaProperty = '(.*?)'", html).group(1)
print(f'https://www.example.com/contact-us/ {gaid}')
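To apply that pattern to your own list of URLs, the loop from the question might look like this (a sketch, assuming list.csv holds one URL in the first column of each row; pages that do not define gaProperty print None instead of raising an error):

import csv
import re
from urllib.request import urlopen

with open('list.csv', 'r') as csvf:
    urls = [row[0] for row in csv.reader(csvf) if row]  # first column of each row

for url in urls:
    html = urlopen(url).read().decode('utf-8', errors='replace')
    match = re.search(r"var gaProperty = '(.*?)'", html)
    gaid = match.group(1) if match else None  # None when the page has no gaProperty
    print(url, gaid)

Searching the raw HTML this way also avoids guessing which <script> tag holds the variable, which is why data[0] came back without a match in the original attempt.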

Related

Scraping a dynamic website with Python Selenium

I am trying to scrape a dynamic website with BS4 in Python:
https://www.nadlan.gov.il/?search=%D7%AA%D7%9C%20%D7%90%D7%91%D7%99%D7%91%20%20%D7%99%D7%A4%D7%95
I tried:
from urllib.request import urlopen
from bs4 import BeautifulSoup
page = urlopen("https://www.nadlan.gov.il/?search=תל אביב יפו")
soup = BeautifulSoup(page, "html.parser")
I have two problems:
The website is dynamic; when I view the page source, I don't see the page content, only JavaScript scripts:
<script>
document.write("<script src='scripts/dis/bundleJS.js?v=" + globalAppVersion + "'><\/script>")
document.write("<script id='srcGovmap' src='https://new.govmap.gov.il/govmap/api/govmap.api.js?v='" + globalAppVersion + "'><\/script>")
document.write("<script src='MainLoader.js?v=" + globalAppVersion + "'><\/script>")
document.write("<script id='tld-search-srcipt'
src='https://www.nadlan.gov.il/TldSearch/Scripts/ac.js?v=" + globalAppVersion + "'><\/script>");
</script>
<script src="scripts/dis/accessibility/b1.js?v=3" type="text/javascript"></script>
<script type="text/javascript">
accessibility_rtl = true;
pixel_from_side = 20;
pixel_from_start = 15;
$(document).ready(function () {
$('#accessibility_icon').attr('src', 'images/accessibility_icon.png')
$('.accessibility_div_wrap>.btn_accessibility > span.accessibility_component').html('')
});
When I open the site it takes a few seconds for the data to load:
How can I solve these problems with Selenium?
The data is loaded dynamically via JavaScript. You can simulate the Ajax calls with requests/json modules. For example:
import json
import requests
url = 'https://www.nadlan.gov.il/Nadlan.REST/Main/GetAssestAndDeals'
data = {"MoreAssestsType":0,"FillterRoomNum":0,"GridDisplayType":0,"ResultLable":"תל אביב -יפו","ResultType":1,"ObjectID":"5000","ObjectIDType":"text","ObjectKey":"UNIQ_ID","DescLayerID":"SETL_MID_POINT","Alert":None,"X":180428.31832654,"Y":665726.5550939,"Gush":"","Parcel":"","showLotParcel":False,"showLotAddress":False,"OriginalSearchString":"תל אביב יפו","MutipuleResults":False,"ResultsOptions":None,"CurrentLavel":2,"Navs":[{"text":"מחוז תל אביב - יפו","url":None,"order":1}],"QueryMapParams":{"QueryToRun":None,"QueryObjectID":"5000","QueryObjectType":"number","QueryObjectKey":"SETL_CODE","QueryDescLayerID":"KSHTANN_SETL_AREA","SpacialWhereClause":None},"isHistorical":False,"PageNo":1,"OrderByFilled":"DEALDATETIME","OrderByDescending":True,"Distance":0}
result = requests.post(url, json=data).json()
# uncomment this to print all data:
# print(json.dumps(result, indent=4))
# print all results to screen:
for r in result['AllResults']:
    for k, v in r.items():
        print('{:<30} {}'.format(k, v))
    print('-' * 80)
Prints:
DEALDATE 12.12.2020
DEALDATETIME 2020-12-12T00:00:00
FULLADRESS
DISPLAYADRESS
GUSH 7104-289-264
DEALNATUREDESCRIPTION דירה
ASSETROOMNUM 3
FLOORNO None
DEALNATURE 90
DEALAMOUNT 3,650,000
NEWPROJECTTEXT 1
PROJECTNAME מגדלי גינדי תל אביב
BUILDINGYEAR None
YEARBUILT
BUILDINGFLOORS None
KEYVALUE 10812534855
TYPE 2
POLYGON_ID 7104-289
TREND_IS_NEGATIVE False
TREND_FORMAT
--------------------------------------------------------------------------------
DEALDATE 31.07.2020
DEALDATETIME 2020-07-31T00:00:00
FULLADRESS עגנון ש"י 28, תל אביב -יפו
DISPLAYADRESS עגנון ש"י 28
GUSH 6634-336-33
DEALNATUREDESCRIPTION דירה
ASSETROOMNUM 5
FLOORNO None
DEALNATURE 130
DEALAMOUNT 6,363,000
NEWPROJECTTEXT 1
PROJECTNAME הפילהרמונית
BUILDINGYEAR 2020
YEARBUILT
BUILDINGFLOORS 9
KEYVALUE 10812534851
TYPE 1
POLYGON_ID 6634-336
TREND_IS_NEGATIVE False
TREND_FORMAT
--------------------------------------------------------------------------------
...and so on.
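If you need more than the first page of deals, the payload above already carries a PageNo field; a sketch, assuming it paginates as its name suggests and that an empty AllResults list marks the end (reusing url and data from the code above):

page_no = 1
while True:
    data['PageNo'] = page_no  # assumption: the endpoint pages on this field
    result = requests.post(url, json=data).json()
    rows = result.get('AllResults') or []
    if not rows:
        break  # no more deals returned
    for r in rows:
        print(r.get('DEALDATE'), r.get('DEALAMOUNT'), r.get('FULLADRESS'))
    page_no += 1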

How to extract var values from a <script> tag in HTML using BeautifulSoup

I am currently using
import requests
from bs4 import BeautifulSoup
source = requests.get('https://www.randomwebsite.com').text
soup = BeautifulSoup(source, 'lxml')
details = soup.find('script')
This returns the following script:
<script>
var Url = "https://www.example.com";
if(Url != ''){code}
else {code
}
</script>
I want the output to be the following:
https://www.example.com
import re
text = """
<script>
var Url = "https://www.example.com";
if(Url != ''){code}
else {code
}
</script>
"""
match = re.search('Url = "(.*?)"', text)
print(match.group(1))
Output:
https://www.example.com
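If you already have the tag from BeautifulSoup as in the question, the same pattern can be run over the tag's text rather than a hard-coded string (a sketch, assuming details is the <script> tag found earlier):

import re

script_text = details.get_text()  # the JavaScript inside the <script> tag
match = re.search(r'var Url = "(.*?)"', script_text)
if match:
    print(match.group(1))  # https://www.example.com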
To print the cashback_url, you can try this script:
import re
import requests
url = 'https://tracking.earnkaro.com/visitretailer/508?id=103894&shareid=ENKR2020090345700421&dl=https%3A%2F%2Fwww.amazon.in%2Fgp%2Fproduct%2FB08645RXJ6%2Fref%3Dox_sc_act_title_1%3Fsmid%3DAT95IG9ONZD7S%26psc%3D1'
html_data = requests.get(url).text
cashback_url = re.search(r'var cashbackUrl = "(.*?)"', html_data).group(1)
print(cashback_url)
Prints:
https://www.amazon.in/gp/product/B08645RXJ6/ref=ox_sc_act_title_1?smid=AT95IG9ONZD7S&psc=1&ck&tag=EK003221-21

How to get a particular script tag?

I am trying to build a download manager script with Python. The web page contains some script tags, and I want to isolate a particular one: the script containing html5player.setVideoUrlHigh('https://*****');.
I don't know how to go about it. I was able to get all the script tags, but I am unable to get the script tag containing html5player.setVideoUrlHigh('https://*****');.
Here is my Python code:
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
Url = '*****'
pg = urlopen(Url)
sp = BeautifulSoup(pg)
script_tag = sp.find_all('script')
# print(script_tag[1])
print(re.search("setVideoHLS\(\'(.*?)\'\)", script_tag).group(1))
The script tag I want to get is this:
<script>
logged_user = false;
var static_id_cdn = 17;
var html5player = new HTML5Player('html5video', '56420147');
if (html5player) {
html5player.setVideoTitle('passionate hotel room');
html5player.setSponsors(false);
html5player.setVideoUrlLow('https://*****');
html5player.setVideoUrlHigh('https://******');
html5player.setVideoHLS('https://****');
html5player.setThumbUrl('https://**');
html5player.setStaticDomain('***');
html5player.setHttps();
html5player.setCanUseHttps();
document.getElementById('html5video').style.minHeight = '';
html5player.initPlayer();
}
How can I get the parameter from this function call: html5player.setVideoUrlHigh('https://******')?
You can get the script tag using this code:
import re
from bs4 import BeautifulSoup
html = """<script> logged_user = false;
var static_id_cdn = 17;
var html5player = new HTML5Player('html5video', '56420147');
if (html5player) {
html5player.setVideoTitle('passionate hotel room');
html5player.setSponsors(false);
html5player.setVideoUrlLow('https://*****');
html5player.setVideoUrlHigh('https://******');
html5player.setVideoHLS('https://****');
html5player.setThumbUrl('https://**');
html5player.setStaticDomain('***');
html5player.setHttps();
html5player.setCanUseHttps();
document.getElementById('html5video').style.minHeight = '';
html5player.initPlayer();
}</script>"""
soup = BeautifulSoup(html, "html.parser")
txt = soup.script.get_text()
print(txt)
Output:
logged_user = false;
var static_id_cdn = 17;
var html5player = new HTML5Player('html5video', '56420147');
if (html5player) {
html5player.setVideoTitle('passionate hotel room');
html5player.setSponsors(false);
html5player.setVideoUrlLow('https://*****');
html5player.setVideoUrlHigh('https://******');
html5player.setVideoHLS('https://****');
html5player.setThumbUrl('https://**');
html5player.setStaticDomain('***');
html5player.setHttps();
html5player.setCanUseHttps();
document.getElementById('html5video').style.minHeight = '';
html5player.initPlayer();
}
EDIT
import requests
import bs4
import re
url = 'url'
r = requests.get(url)
bs = bs4.BeautifulSoup(r.text, "html.parser")
scripts = bs.find_all('script')
src = scripts[7] #Needed script is in position 7
print(re.search(r"html5player\.setVideoUrlHigh\('(.*?)'\)", str(src)).group(1))
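Hard-coding scripts[7] will break as soon as the page layout changes; a more robust sketch applies the same regex to every script tag until one matches (reusing bs and re from the snippet above):

pattern = re.compile(r"html5player\.setVideoUrlHigh\('(.*?)'\)")
video_url = None
for script in bs.find_all('script'):
    match = pattern.search(script.get_text())
    if match:
        video_url = match.group(1)
        break
print(video_url)  # stays None if no script calls setVideoUrlHigh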

How to solve JavaScript redirect issue on Python Scrapy?

I am fairly new to Scrapy and am following the docs to scrape info from https://pbejobbers.com/abrasives
using my script:
import scrapy

class CrwSpider(scrapy.Spider):
    name = "Otim"

    def start_requests(self):
        urls = [
            'https://pbejobbers.com/abrasives'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        pattern = r'document\.location\.href=\"(:?.*)=1\";'
        url = response.css('script::text').re_first(pattern)
        print(url)
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        next_page = url + '=1'
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
When I run the crawler, instead of getting the actual page, I get this JavaScript:
<html>
<body>
<script type="text/javascript" src="/aes.min.js"></script>
<script>
function toNumbers(d) {
var e = [];
d.replace(/(..)/g, function(d) {
e.push(parseInt(d, 16));
});
return e;
}
function toHex() {
for (
var d = [],
d =
1 == arguments.length && arguments[0].constructor == Array
? arguments[0]
: arguments,
e = "",
f = 0;
f < d.length;
f++
)
e += (16 > d[f] ? "0" : "") + d[f].toString(16);
return e.toLowerCase();
}
var a = toNumbers("de50860916c188904e9c359aaaf4f248"),
b = toNumbers("a944ac1efe048739325d92e58868ffa1"),
c = toNumbers("34d8ed644eb63ddaafeb01607ce6b6ce");
document.cookie =
"OCXS=" +
toHex(slowAES.decrypt(c, 2, a, b)) +
"; expires=Thu, 31-Dec-37 23:55:55 GMT; path=/";
document.location.href =
"http://pbejobbers.com/abrasives?81e93addddb02a10cd0652f09370ae96=2";
</script>
</body>
</html>
The actual page consists of a list of product cards. I noticed that document.location.href has a new URL, so I grabbed it with a regular expression and tried to feed it back to the crawler as a new URL to parse, but then I get the same result, with a URL ending in =2. I can't seem to get to the actual page.
How can I get the redirect URL and use it? I am really stuck.
You cannot get to the next page using Scrapy alone: when you disable JavaScript in the browser's inspection tools, the Next button stops working, and Scrapy cannot render JavaScript. You have to use Selenium to scrape this page.
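A minimal Selenium sketch along those lines (assuming Chrome is available and that product cards render once the JavaScript redirect has run; the '.product-card' selector is a placeholder you would need to replace after inspecting the real markup):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://pbejobbers.com/abrasives')
# wait until the JS redirect has finished and the cards are rendered
cards = WebDriverWait(driver, 15).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.product-card'))  # placeholder selector
)
for card in cards:
    print(card.text)
driver.quit()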

Getting a JavaScript variable value while scraping with Python

I know this has been asked before, but I am a newbie to scraping and Python. Please help me; it would be very helpful on my learning path.
I am scraping a news site using Python with packages such as Beautiful Soup.
I am having difficulty getting the value of a JavaScript variable which is declared in a script tag and is also updated there.
Here is the part of the HTML page I am scraping (containing only the script part):
<!-- Eliminate render-blocking JavaScript and CSS in above-the-fold content -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/0.97.0/js/materialize.min.js"></script>
<script type="text/javascript" src="/dist/scripts/index.js"></script>
<script type="text/javascript" src="/dist/scripts/read.js"></script>
<script src="/dist/scripts/jquery.scrolldepth.min.js"></script>
<script type="text/javascript">
var min_news_id = "d7zlgjdu-1"; // line 1
function loadMoreNews(){
$("#load-more-btn").hide();
$("#load-more-gif").show();
$.post("/en/ajax/more_news",{'category':'politics','news_offset':min_news_id},function(data){
data = JSON.parse(data);
min_news_id = data.min_news_id||min_news_id; // line 2
$(".card-stack").append(data.html);
})
.fail(function(){alert("Error : unable to load more news");})
.always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
</script>
From the above part, I want to get the value of min_news_id in Python.
I should also get the value of the same variable when it is updated at line 2.
Here is how I am doing it:
self.pattern = re.compile('var min_news_id = (.+?);') # or self.pattern = re.compile('min_news_id = (.+?);')
page = bs(htmlPage, "html.parser")
# find all the script tags
scripts = page.find_all("script")
for script in scripts:
    for line in script:
        scriptString = str(line)
        if "min_news_id" in scriptString:
            scriptString.replace('"', '\\"')
            print(scriptString)
            if(self.pattern.match(str(scriptString))):
                print("matched")
                data = self.pattern.match(scriptString)
                jsVariable = json.loads(data.groups()[0])
                InShortsScraper.newsOffset = jsVariable
                print(InShortsScraper.newsOffset)
But I never get the value of the variable. Is the problem with my regular expression, or something else? Please help me.
Thank you in advance.
html = '''<!-- Eliminate render-blocking JavaScript and CSS in above-the-fold content -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/0.97.0/js/materialize.min.js"></script>
<script type="text/javascript" src="/dist/scripts/index.js"></script>
<script type="text/javascript" src="/dist/scripts/read.js"></script>
<script src="/dist/scripts/jquery.scrolldepth.min.js"></script>
<script type="text/javascript">
var min_news_id = "d7zlgjdu-1"; // line 1
function loadMoreNews(){
$("#load-more-btn").hide();
$("#load-more-gif").show();
$.post("/en/ajax/more_news",{'category':'politics','news_offset':min_news_id},function(data){
data = JSON.parse(data);
min_news_id = data.min_news_id||min_news_id; // line 2
$(".card-stack").append(data.html);
})
.fail(function(){alert("Error : unable to load more news");})
.always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
</script>'''
finder = re.findall(r'min_news_id = .*;', html)
print(finder)
Output:
['min_news_id = "d7zlgjdu-1";', 'min_news_id = data.min_news_id||min_news_id;']
#2 OR YOU CAN USE
print(finder[0].replace('min_news_id = ', '').replace('"','').replace(';','').strip())
Output:
d7zlgjdu-1
#3 OR YOU CAN USE
finder = re.findall(r'[a-z0-9]{8}-[0-9]', html)
print(finder)
Output:
['d7zlgjdu-1']
You can't monitor a JavaScript variable change using BeautifulSoup. Here is how to get the next page of news using a while loop, re, and json:
from bs4 import BeautifulSoup
import requests, re

page_url = 'https://inshorts.com/en/read/politics'
ajax_url = 'https://inshorts.com/en/ajax/more_news'
htmlPage = requests.get(page_url).text
# BeautifulSoup extract article summary
# page = BeautifulSoup(htmlPage, "html.parser")
# ...
# get current min_news_id
min_news_id = re.search(r'min_news_id\s+=\s+"([^"]+)', htmlPage).group(1)  # result: d7zlgjdu-1
customHead = {'X-Requested-With': 'XMLHttpRequest', 'Referer': page_url}
while min_news_id:
    # change "politics" if in a different category
    reqBody = {'category': 'politics', 'news_offset': min_news_id}
    # get Ajax next page
    ajax_response = requests.post(ajax_url, headers=customHead, data=reqBody).json()  # parse string to json
    # again, extract the article summary
    page = BeautifulSoup(ajax_response["html"], "html.parser")
    # ....
    # ....
    # new min_news_id
    min_news_id = ajax_response["min_news_id"]
    # remove this to loop over all pages (thousands?)
    break
Thank you for the response. I finally solved it using the requests package after reading its documentation.
Here is my code:
if InShortsScraper.firstLoad == True:
    self.pattern = re.compile('var min_news_id = (.+?);')
else:
    self.pattern = re.compile('min_news_id = (.+?);')
page = None
# print("Pattern: " + str(self.pattern))
if news_offset == None:
    htmlPage = urlopen(url)
    page = bs(htmlPage, "html.parser")
else:
    self.loadMore['news_offset'] = InShortsScraper.newsOffset
    # print("payload : " + str(self.loadMore))
    try:
        r = myRequest.post(
            url = url,
            data = self.loadMore
        )
    except TypeError:
        print("Error in loading")
    InShortsScraper.newsOffset = r.json()["min_news_id"]
    page = bs(r.json()["html"], "html.parser")
    # print(page)
if InShortsScraper.newsOffset == None:
    scripts = page.find_all("script")
    for script in scripts:
        for line in script:
            scriptString = str(line)
            if "min_news_id" in scriptString:
                finder = re.findall(self.pattern, scriptString)
                InShortsScraper.newsOffset = finder[0].replace('min_news_id = ', '').replace('"', '').replace(';', '').strip()
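A small simplification, in case it helps: since the compiled pattern already contains a capture group, re.search can hand back the value directly (a sketch, assuming the same self.pattern and scriptString as above):

match = self.pattern.search(scriptString)
if match:
    # group(1) is everything before the semicolon, e.g. "d7zlgjdu-1" including the quotes
    InShortsScraper.newsOffset = match.group(1).strip().strip('"')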
