I'm trying to get a JavaScript var value from an HTML source code using BeautifulSoup.
For example I have:
<script>
[other code]
var my = 'hello';
var name = 'hi';
var is = 'halo';
[other code]
</script>
I want something to return the value of the var "my" in Python
How can I achieve that?
The simplest approach is to use a regular expression pattern to both locate the element via BeautifulSoup and extract the desired substring:
import re
from bs4 import BeautifulSoup
data = """
<script>
[other code]
var my = 'hello';
var name = 'hi';
var is = 'halo';
[other code]
</script>
"""
soup = BeautifulSoup(data, "html.parser")

# With re.MULTILINE, `$` matches at the end of every line, so the pattern
# captures the quoted value of the `var my = '...';` statement on its own line.
pattern = re.compile(r"var my = '(.*?)';$", re.MULTILINE | re.DOTALL)

# `string=` is the current name of this filter (it was called `text=` before
# Beautiful Soup 4.4.0); it selects the <script> whose content matches.
script = soup.find("script", string=pattern)

# Re-run the same pattern on the script body to pull out the captured value.
print(pattern.search(script.text).group(1))
Prints hello.
Another idea would be to use a JavaScript parser and locate a variable declaration node, check the identifier to be of a desired value and extract the initializer. Example using slimit parser:
from bs4 import BeautifulSoup
from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor
data = """
<script>
var my = 'hello';
var name = 'hi';
var is = 'halo';
</script>
"""
soup = BeautifulSoup(data, "html.parser")
script = soup.find("script", text=lambda text: text and "var my" in text)

# Parse the JavaScript source of the matched <script> into a syntax tree.
tree = Parser().parse(script.text)

# Walk every node and report the initializer of the declaration named "my".
for node in nodevisitor.visit(tree):
    if not isinstance(node, ast.VarDecl):
        continue
    if node.identifier.value == 'my':
        print(node.initializer.value)
Prints hello.
Regarding the pattern in the answer, pattern = re.compile(r"var my = '(.*?)';$", re.MULTILINE | re.DOTALL):
it did not match for me; I had to remove the end-of-line anchor $ when re.MULTILINE and re.DOTALL are set at the same time.
Tested with Python 3.6.4.
Building on @alecxe's answer, but considering a more complex case of an array of dictionaries - or an array of flat JSON objects:
from bs4 import BeautifulSoup
from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor
data = """
<script>
var my = [{'dic1key1':1}, {'dic2key1':1}];
var name = 'hi';
var is = 'halo';
</script>
"""
soup = BeautifulSoup(data, "html.parser")
script = soup.find("script", text=lambda text: text and "var my" in text)

# Parse the JavaScript source of the matched <script> into a syntax tree.
parser = Parser()
tree = parser.parse(script.text)

# One dict per element of the array assigned to `var my`.
array_items = []
for node in nodevisitor.visit(tree):
    if isinstance(node, ast.VarDecl) and node.identifier.value == 'my':
        for item in node.initializer.items:
            # Each key/value pair of the object literal is an Assign node.
            # Only `ast` is imported from slimit, so the class must be
            # referenced as `ast.Assign` (`slimit.ast.Assign` is a NameError).
            # [1:-1] drops the first and last character of each side,
            # presumably the quotes around string literals.
            # NOTE(review): an unquoted value such as the number 1 would be
            # reduced to an empty string by this slice - confirm.
            parsed_dict = {
                getattr(n.left, 'value', '')[1:-1]: getattr(n.right, 'value', '')[1:-1]
                for n in nodevisitor.visit(item)
                if isinstance(n, ast.Assign)
            }
            array_items.append(parsed_dict)
print(array_items)
Related
I want to Extract data from a variable which is inside of a script:
<script>
var Itemlist = 'null';
var ItemData = '[{\"item_id\":\"107\",\"id\":\"79\",\"line_item_no\":\"1\",\"Amount\":\"99999.00\"}]';
</script>
I want to store the item_id and the Amount in Python variables.
I tried using a regex; it worked for a while, but when the cookie session updated it stopped working.
Is there any other way to get those values?
I am using this method to get the script from the HTML, but its position changes when the cookie session updates:
soup = bs(response.content, 'html.parser')
# find() returns only the first matching tag; find_all() returns the list of
# all <script> tags, which is what can be indexed by position.
script = soup.find_all('script')[8]
So I have to change the index that I put after ('script'): for now it's [8], but whenever the cookie session updates I have to keep changing the number until I find the script I am looking for.
To get the data from the <script> you can use this example:
import re
import json
from bs4 import BeautifulSoup
html_data = """
<script>
var Itemlist = 'null';
var ItemData = '[{\"item_id\":\"107\",\"id\":\"79\",\"line_item_no\":\"1\",\"Amount\":\"99999.00\"}]';
</script>
"""
soup = BeautifulSoup(html_data, "html.parser")

# Locate the script by its content instead of its position in the page, so the
# lookup does not depend on how many <script> tags come before it.
script = soup.find("script", string=re.compile(r"\bItemData\b"))

# The variable holds a JSON document inside a single-quoted JavaScript string:
# capture everything between the quotes, then decode it.
raw_json = re.search(r"ItemData = '(.*)';", script.string).group(1)
data = json.loads(raw_json)

print("Item_id =", data[0]["item_id"], "Amount =", data[0]["Amount"])
Prints:
Item_id = 107 Amount = 99999.00
i am currently using
import requests
from bs4 import BeautifulSoup
source = requests.get('www.randomwebsite.com').text
soup = BeautifulSoup(source,'lxml')
details= soup.find('script')
this is returning me the following script.
<script>
var Url = "https://www.example.com";
if(Url != ''){code}
else {code
}
</script>
i want to have the output as following.
https://www.example.com
import re
text = """
<script>
var Url = "https://www.example.com";
if(Url != ''){code}
else {code
}
</script>
"""

# Capture whatever sits between the double quotes of the `Url = "..."`
# assignment; the non-greedy group stops at the first closing quote.
url_pattern = re.compile('Url = "(.*?)"')
match = url_pattern.search(text)
print(match.group(1))
Output:
https://www.example.com
To print the cashback_url, you can try this script:
import re
import requests
url = 'https://tracking.earnkaro.com/visitretailer/508?id=103894&shareid=ENKR2020090345700421&dl=https%3A%2F%2Fwww.amazon.in%2Fgp%2Fproduct%2FB08645RXJ6%2Fref%3Dox_sc_act_title_1%3Fsmid%3DAT95IG9ONZD7S%26psc%3D1'
html_data = requests.get(url).text
cashback_url = re.search(r'var cashbackUrl = "(.*?)"', html_data).group(1)
print(cashback_url)
Prints:
https://www.amazon.in/gp/product/B08645RXJ6/ref=ox_sc_act_title_1?smid=AT95IG9ONZD7S&psc=1&ck&tag=EK003221-21
I am trying to build a download manager script with Python. The web page contains several script tags, and I want to isolate one particular script: the one containing html5player.setVideoUrlHigh('https://*****');.
I don't know how to go about it. I was able to get all the script tags, but I am unable to pick out the script tag containing html5player.setVideoUrlHigh('https://*****');
Here is my python code
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
Url = '*****'
pg = urlopen(Url)
sp = BeautifulSoup(pg)
script_tag = sp.find_all('script')
# print(script_tag[1])
print(re.search("setVideoHLS\(\'(.*?)\'\)", script_tag).group(1))
The script tag i want to get is this:
<script>
logged_user = false;
var static_id_cdn = 17;
var html5player = new HTML5Player('html5video', '56420147');
if (html5player) {
html5player.setVideoTitle('passionate hotel room');
html5player.setSponsors(false);
html5player.setVideoUrlLow('https://*****');
html5player.setVideoUrlHigh('https://******');
html5player.setVideoHLS('https://****');
html5player.setThumbUrl('https://**');
html5player.setStaticDomain('***');
html5player.setHttps();
html5player.setCanUseHttps();
document.getElementById('html5video').style.minHeight = '';
html5player.initPlayer();
}
How can I get the parameter from this function call: `html5player.setVideoUrlHigh('https://******')`?
You can get the script tag using this code,
import re
from bs4 import BeautifulSoup
html = """<script> logged_user = false;
var static_id_cdn = 17;
var html5player = new HTML5Player('html5video', '56420147');
if (html5player) {
html5player.setVideoTitle('passionate hotel room');
html5player.setSponsors(false);
html5player.setVideoUrlLow('https://*****');
html5player.setVideoUrlHigh('https://******');
html5player.setVideoHLS(''https://****');
html5player.setThumbUrl('https://**');
html5player.setStaticDomain('***');
html5player.setHttps();
html5player.setCanUseHttps();
document.getElementById('html5video').style.minHeight = '';
html5player.initPlayer();
}</script>"""
# The markup is stored in the lower-case variable `html`; naming the parser
# explicitly keeps the result independent of which parsers are installed.
soup = BeautifulSoup(html, "html.parser")

# `soup.script` is the first <script> tag; get_text() returns its source.
txt = soup.script.get_text()
print(txt)
Output:
logged_user = false;
var static_id_cdn = 17;
var html5player = new HTML5Player('html5video', '56420147');
if (html5player) {
html5player.setVideoTitle('passionate hotel room');
html5player.setSponsors(false);
html5player.setVideoUrlLow('https://*****');
html5player.setVideoUrlHigh('https://******');
html5player.setVideoHLS(''https://****');
html5player.setThumbUrl('https://**');
html5player.setStaticDomain('***');
html5player.setHttps();
html5player.setCanUseHttps();
document.getElementById('html5video').style.minHeight = '';
html5player.initPlayer();
}
EDIT
import requests
import bs4
import re
url = 'url'
r = requests.get(url)
bs = bs4.BeautifulSoup(r.text, "html.parser")
scripts = bs.find_all('script')
src = scripts[7]  # Needed script is in position 7

# Raw string so the regex escapes reach `re` unchanged (a plain string with
# `\(` is an invalid escape sequence); the dot is escaped to match literally.
# The group captures the single-quoted argument of setVideoUrlHigh(...).
print(re.search(r"html5player\.setVideoUrlHigh\('(.*?)'\)", str(src)).group(1))
How can you get the value of the variable ue_mid if you were trying to scrape a web page using BeautifulSoup and also using this function: soup.select_one()?
This is how the list of variables on the source code looks like:
var ue_id = 'XXXXXXXXXXXX',
ue_mid = 'ValueToGet',
ue_navtiming = 1;
Thank you so much in advance! 🙏
It is JavaScript. You can use select_one() only to get text from tag <script> and later you have to use string's functions (or regex) to extract it from string.
html = '''<script>
var ue_id = 'XXXXXXXXXXXX',
ue_mid = 'ValueToGet',
ue_navtiming = 1;
</script>'''
from bs4 import BeautifulSoup as BS
soup = BS(html, 'html.parser')
text = soup.select_one('script').get_text()

# Keep what follows the `ue_mid = '` marker, then cut at the closing `',`.
text = text.split("ue_mid = '")[1].split("',")[0]

print(text)
# ValueToGet
I am trying to extract the content of a single "value" attribute in a specific "input" tag on a webpage. I use the following code:
import urllib
f = urllib.urlopen("http://58.68.130.147")
s = f.read()
f.close()
from BeautifulSoup import BeautifulStoneSoup
soup = BeautifulStoneSoup(s)
inputTag = soup.findAll(attrs={"name" : "stainfo"})
output = inputTag['value']
print str(output)
I get TypeError: list indices must be integers, not str
Even though, from the Beautifulsoup documentation, I understand that strings should not be a problem here... but I am no specialist, and I may have misunderstood.
Any suggestion is greatly appreciated!
.find_all() returns list of all found elements, so:
input_tag = soup.find_all(attrs={"name" : "stainfo"})
input_tag is a list (probably containing only one element). Depending on what you want exactly you either should do:
output = input_tag[0]['value']
or use .find() method which returns only one (first) found element:
input_tag = soup.find(attrs={"name": "stainfo"})
output = input_tag['value']
In Python 3.x, simply use get(attr_name) on your tag object that you get using find_all:
# Read the whole document; the file declares UTF-8 in its XML prolog, so open
# it with that encoding instead of the platform default.
with open('conf//test1.xml', 'r', encoding='utf-8') as xmlFile:
    xmlData = xmlFile.read()

xmlSoup = BeautifulSoup(xmlData, 'html.parser')

# The tag is looked up by its lower-case name because html.parser lower-cases
# tag names (the file spells it `repeatingElement`).
repElemList = xmlSoup.find_all('repeatingelement')
for repElem in repElemList:
    print("Processing repElem...")
    # Tag.get() returns the attribute value, or None when it is absent.
    repElemID = repElem.get('id')
    repElemName = repElem.get('name')
    print("Attribute id = %s" % repElemID)
    print("Attribute name = %s" % repElemName)
against XML file conf//test1.xml that looks like:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<root>
<singleElement>
<subElementX>XYZ</subElementX>
</singleElement>
<repeatingElement id="11" name="Joe"/>
<repeatingElement id="12" name="Mary"/>
</root>
prints:
Processing repElem...
Attribute id = 11
Attribute name = Joe
Processing repElem...
Attribute id = 12
Attribute name = Mary
For me:
<input id="color" value="Blue"/>
This can be fetched by below snippet.
page = requests.get("https://www.abcd.com")
soup = BeautifulSoup(page.content, 'html.parser')
colorName = soup.find(id='color')
print(colorName['value'])
If you want to retrieve multiple values of attributes from the source above, you can use findAll and a list comprehension to get everything you need:
import urllib
f = urllib.urlopen("http://58.68.130.147")
s = f.read()
f.close()
from BeautifulSoup import BeautifulStoneSoup
soup = BeautifulStoneSoup(s)
inputTags = soup.findAll(attrs={"name" : "stainfo"})
### You may be able to do findAll("input", attrs={"name" : "stainfo"})
output = [x["stainfo"] for x in inputTags]
print output
### This will print a list of the values.
I would actually suggest a time-saving way to go about this, assuming that you know which kind of tags have those attributes.
Suppose, say, a tag xyz has an attribute named "staininfo":
full_tag = soup.findAll("xyz")
And I want you to understand that full_tag is a list:
for each_tag in full_tag:
staininfo_attrb_value = each_tag["staininfo"]
print staininfo_attrb_value
Thus you can get all the attrb values of staininfo for all the tags xyz
you can also use this :
import requests
from bs4 import BeautifulSoup
import csv
url = "http://58.68.130.147/"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")
get_details = soup.find_all("input", attrs={"name":"stainfo"})
for val in get_details:
get_val = val["value"]
print(get_val)
You could try to use the new powerful package called requests_html:
from requests_html import HTMLSession
session = HTMLSession()
r = session.get("https://www.bbc.co.uk/news/technology-54448223")
date = r.html.find('time', first = True) # finding a "tag" called "time"
print(date) # you will have: <Element 'time' datetime='2020-10-07T11:41:22.000Z'>
# To get the text inside the "datetime" attribute use:
print(date.attrs['datetime']) # you will get '2020-10-07T11:41:22.000Z'
I am using this with Beautifulsoup 4.8.1 to get the value of all class attributes of certain elements:
from bs4 import BeautifulSoup
html = "<td class='val1'/><td col='1'/><td class='val2' />"
bsoup = BeautifulSoup(html, 'html.parser')
for td in bsoup.find_all('td'):
if td.has_attr('class'):
print(td['class'][0])
It's important to note that the class attribute key retrieves a list even when the attribute has only a single value.
Here is an example of how to extract the href attributes of all a tags:
import requests as rq
from bs4 import BeautifulSoup as bs
url = "http://www.cde.ca.gov/ds/sp/ai/"
page = rq.get(url)
html = bs(page.text, 'lxml')

# One entry per <a> tag: its href value, or None when the tag has no href.
all_hrefs = [anchor.get("href") for anchor in html.find_all("a")]
print(all_hrefs)
You can try gazpacho:
Install it using pip install gazpacho
Get the HTML and make the Soup using:
from gazpacho import get, Soup
soup = Soup(get("http://ip.add.ress.here/"))  # get directly returns the html

# Find the input tags named "stainfo". The result is handled both as a list of
# elements and as a single element, hence the two branches below.
inputs = soup.find('input', attrs={'name': 'stainfo'})

if not inputs:
    # The message is single-quoted so the double quotes inside it are literal.
    print('No <input> tag found with the attribute name="stainfo"')
elif isinstance(inputs, list):
    for tag in inputs:
        # A gazpacho element keeps its attributes in the `attrs` dict.
        print(tag.attrs.get('value'))
else:
    print(inputs.attrs.get('value'))