I know this has been asked before, but I am a newbie to scraping and Python. Please help me — it would be very helpful on my learning path.
I am scraping a news site using Python with packages such as Beautiful Soup.
I am facing difficulty getting the value of a JavaScript variable which is declared in a script tag and is also updated there.
Here is the part of the HTML page which I am scraping (containing only the script part):
<!-- Eliminate render-blocking JavaScript and CSS in above-the-fold content -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/0.97.0/js/materialize.min.js"></script>
<script type="text/javascript" src="/dist/scripts/index.js"></script>
<script type="text/javascript" src="/dist/scripts/read.js"></script>
<script src="/dist/scripts/jquery.scrolldepth.min.js"></script>
<script type="text/javascript">
var min_news_id = "d7zlgjdu-1"; // line 1
function loadMoreNews(){
$("#load-more-btn").hide();
$("#load-more-gif").show();
$.post("/en/ajax/more_news",{'category':'politics','news_offset':min_news_id},function(data){
data = JSON.parse(data);
min_news_id = data.min_news_id||min_news_id; // line 2
$(".card-stack").append(data.html);
})
.fail(function(){alert("Error : unable to load more news");})
.always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
</script>
From the above part, I want to get the value of min_news_id in Python.
I should also get the updated value of the same variable when it is changed on line 2.
Here is how I am doing it:
# NOTE: re.match() anchors at the start of the string, but the <script> text
# begins with whitespace/newlines, so match() never succeeds here.
# re.search() scans the whole string and is what we actually need.
self.pattern = re.compile(r'var min_news_id = (.+?);')  # or re.compile(r'min_news_id = (.+?);')
page = bs(htmlPage, "html.parser")
# find all the script tags
scripts = page.find_all("script")
for script in scripts:
    for line in script:
        scriptString = str(line)
        if "min_news_id" in scriptString:
            # The original called scriptString.replace('"', '\\"') and discarded
            # the result — str.replace returns a new string — so it had no effect
            # and is dropped here.
            print(scriptString)
            data = self.pattern.search(scriptString)
            if data:
                print("matched")
                # group 1 is the quoted value, e.g. "d7zlgjdu-1"; json.loads
                # strips the surrounding quotes.
                jsVariable = json.loads(data.groups()[0])
                InShortsScraper.newsOffset = jsVariable
                print(InShortsScraper.newsOffset)
But I am never getting the value of the variable. Is the problem with my regular expression, or something else? Please help me.
Thank you in advance.
# Sample page source containing the inline <script> we want to mine.
html = '''<!-- Eliminate render-blocking JavaScript and CSS in above-the-fold content -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.8/jquery.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/0.97.0/js/materialize.min.js"></script>
<script type="text/javascript" src="/dist/scripts/index.js"></script>
<script type="text/javascript" src="/dist/scripts/read.js"></script>
<script src="/dist/scripts/jquery.scrolldepth.min.js"></script>
<script type="text/javascript">
var min_news_id = "d7zlgjdu-1"; // line 1
function loadMoreNews(){
$("#load-more-btn").hide();
$("#load-more-gif").show();
$.post("/en/ajax/more_news",{'category':'politics','news_offset':min_news_id},function(data){
data = JSON.parse(data);
min_news_id = data.min_news_id||min_news_id; // line 2
$(".card-stack").append(data.html);
})
.fail(function(){alert("Error : unable to load more news");})
.always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
</script>'''

# Every statement that assigns to min_news_id: both the declaration (line 1)
# and the later reassignment (line 2).
assignment_pattern = re.compile(r'min_news_id = .*;')
finder = assignment_pattern.findall(html)
print(finder)
Output:
['min_news_id = "d7zlgjdu-1";', 'min_news_id = data.min_news_id||min_news_id;']
#2 OR YOU CAN USE
print(finder[0].replace('min_news_id = ', '').replace('"','').replace(';','').strip())
Output:
d7zlgjdu-1
#3 OR YOU CAN USE
finder = re.findall(r'[a-z0-9]{8}-[0-9]', html)
print(finder)
Output:
['d7zlgjdu-1']
You can't monitor a JavaScript variable change using BeautifulSoup. Here is how to get the next page of news using a while loop, re and json:
from bs4 import BeautifulSoup
import requests, re
page_url = 'https://inshorts.com/en/read/politics'
ajax_url = 'https://inshorts.com/en/ajax/more_news'

htmlPage = requests.get(page_url).text

# BeautifulSoup extract article summary
# page = BeautifulSoup(htmlPage, "html.parser")
# ...

# Get the current min_news_id from the inline <script>.
# FIX: use a raw string — '\s' in a plain string literal is an invalid escape
# sequence and triggers a DeprecationWarning/SyntaxWarning on modern Python.
min_news_id = re.search(r'min_news_id\s+=\s+"([^"]+)', htmlPage).group(1)  # result: d7zlgjdu-1

# The Ajax endpoint expects an XHR-style request coming from the listing page.
customHead = {'X-Requested-With': 'XMLHttpRequest', 'Referer': page_url}

while min_news_id:
    # change "politics" if in a different category
    reqBody = {'category': 'politics', 'news_offset': min_news_id}
    # get the Ajax next page and parse the JSON body
    ajax_response = requests.post(ajax_url, headers=customHead, data=reqBody).json()
    # again, extract the article summaries from the returned HTML fragment
    page = BeautifulSoup(ajax_response["html"], "html.parser")
    # ....
    # ....
    # new min_news_id for the following page
    min_news_id = ajax_response["min_news_id"]
    # remove this `break` to loop over all pages (thousands?)
    break
Thank you for the response. I finally solved it using the requests package after reading its documentation;
here is my code:
# Choose the pattern: the initial page declares the variable ("var ..."),
# while later Ajax responses only reassign it.
if InShortsScraper.firstLoad == True:
    self.pattern = re.compile('var min_news_id = (.+?);')
else:
    self.pattern = re.compile('min_news_id = (.+?);')
page = None
# print("Pattern: " + str(self.pattern))
if news_offset == None:
    # First load: fetch and parse the regular HTML page.
    htmlPage = urlopen(url)
    page = bs(htmlPage, "html.parser")
else:
    # Subsequent loads: POST the last offset to the Ajax endpoint.
    self.loadMore['news_offset'] = InShortsScraper.newsOffset
    # print("payload : " + str(self.loadMore))
    try:
        r = myRequest.post(
            url=url,
            data=self.loadMore
        )
    except TypeError:
        # FIX: the original printed and fell through, leaving `r` unbound so
        # the next line raised NameError; re-raise after reporting instead.
        print("Error in loading")
        raise
    # Parse the JSON body once instead of calling r.json() twice.
    body = r.json()
    InShortsScraper.newsOffset = body["min_news_id"]
    page = bs(body["html"], "html.parser")
# print(page)
if InShortsScraper.newsOffset == None:
    # Fall back to scraping the offset out of the inline <script>.
    scripts = page.find_all("script")
    for script in scripts:
        for line in script:
            scriptString = str(line)
            if "min_news_id" in scriptString:
                finder = re.findall(self.pattern, scriptString)
                InShortsScraper.newsOffset = finder[0].replace('min_news_id = ', '').replace('"', '').replace(';', '').strip()
Related
I want to extract data from a variable which is inside a script:
<script>
var Itemlist = 'null';
var ItemData = '[{\"item_id\":\"107\",\"id\":\"79\",\"line_item_no\":\"1\",\"Amount\":\"99999.00\"}]';
</script>
I want the item_id and the Amount inside a variable in Python.
I tried using regex; it worked for a while, but when the cookies session updated it stopped working.
Is there any other way to get those values?
I am using this method to get the script from the HTML, but it changes when the cookie session updates:
soup = bs(response.content, 'html.parser')
# NOTE(review): soup.find('script') returns a single Tag; indexing it with [8]
# presumably should be soup.find_all('script')[8] — confirm against the original code.
script = soup.find('script')[8]
so I have to change the index after ('script'); for now it's [8], and if the cookie session updates I have to keep changing the number until I find the script I am looking for.
To get the data from the <script> you can use this example:
import re
import json
from bs4 import BeautifulSoup

html_data = """
<script>
var Itemlist = 'null';
var ItemData = '[{\"item_id\":\"107\",\"id\":\"79\",\"line_item_no\":\"1\",\"Amount\":\"99999.00\"}]';
</script>
"""

# Parse the markup and take the text of the first <script> tag.
soup = BeautifulSoup(html_data, "html.parser")
script_text = soup.select_one("script").text

# The JSON payload lives between the single quotes of the ItemData assignment.
payload = re.search(r"ItemData = '(.*)';", script_text).group(1)
records = json.loads(payload)

print("Item_id =", records[0]["item_id"], "Amount =", records[0]["Amount"])
Prints:
Item_id = 107 Amount = 99999.00
I have the following javascript in the header of pages on my site:
<script type='text/javascript'>
var gaProperty = 'UA-00000000-1';
var disableStr = 'ga-disable-' + gaProperty;
if ( document.cookie.indexOf( disableStr + '=true' ) > -1 ) {
window[disableStr] = true;
}
function gaOptout() {
document.cookie = disableStr + '=true; expires=Thu, 31 Dec 2099 23:59:59 UTC; path=/';
window[disableStr] = true;
}
</script>
I'm trying to extract the var gaProperty from each page (i.e. UA-00000000-1) in a list of URLs in a CSV file using Python. I'm new to Python and put together a script from bits of scripts I've seen around, but it doesn't work:
from requests_html import HTMLSession
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import re

# FIX: don't shadow the builtin `list` with a variable name.
url_rows = []
with open('list.csv', 'r') as csvf:  # open file in read mode
    for row in csv.reader(csvf):
        url_rows.append(row)  # each row is a list; row[0] is the URL

# Compile once, outside the loop.
ga_pattern = re.compile(r'UA-[0-9]+-[0-9]+')

for url in url_rows:
    page = urlopen(url[0]).read()
    path = " ".join(url)
    soup = BeautifulSoup(page, "lxml")
    gaid = None
    # FIX: the GA property is not necessarily in the first script tag,
    # so scan them all instead of only data[0].
    for tag in soup.find_all('script', type='text/javascript'):
        match = ga_pattern.search(tag.text)
        if match:
            # FIX: print the matched string (.group(0)), not the Match object.
            gaid = match.group(0)
            break
    print(path, gaid)
The incorrect result I get is:
https:www.example.com/contact-us/ None
I need to achieve this desired output for each url:
https:www.example.com/contact-us/ UA-00000000-1
Any idea how to get this working in Python?
I would include `var gaProperty` in the pattern to be more specific, then make the capture group lazily capture everything between the quotes that follow — i.e. wrap the gaid value.
import re

html = '''
<script type='text/javascript'>
var gaProperty = 'UA-00000000-1';
var disableStr = 'ga-disable-' + gaProperty;
if ( document.cookie.indexOf( disableStr + '=true' ) > -1 ) {
window[disableStr] = true;
}
function gaOptout() {
document.cookie = disableStr + '=true; expires=Thu, 31 Dec 2099 23:59:59 UTC; path=/';
window[disableStr] = true;
}
</script>'''

# Anchor on the `var gaProperty` declaration and lazily capture the value
# between the single quotes that follow it.
ga_declaration = re.compile(r"var gaProperty = '(.*?)'")
gaid = ga_declaration.search(html).group(1)
print(f'https:www.example.com/contact-us/{gaid}')
I'm trying to scrape some data on two websites, and I successfully scraped it. I also want to develop an API using this scraped data with Django. But when I try to display the scraped data in JSON format in Django, it only shows an empty list. Below I attached my code snippets.
from django.shortcuts import render
from bs4 import BeautifulSoup
import requests
import re
import json
import time
data = []
def getURL(url):
    """Build a tutorialbar.com URL slug from a course header.

    Returns the full URL when the page responds with HTTP 200,
    otherwise returns None.
    """
    # Dashes for spaces, lowercase — same order as the original code.
    slug = url.replace(' ', '-').lower()
    # One C-level pass instead of a per-character loop:
    # punctuation -> '-', apostrophes and commas removed.
    slug = slug.translate(str.maketrans("?.!:;|/[]&()", "-" * 12, "',"))
    # Collapse the runs of dashes produced by the substitutions above.
    decodeUrl = re.sub(r'-+', '-', slug)
    # check whether the URL is up or not
    parsedUrl = "http://www.tutorialbar.com/" + decodeUrl + "/"
    if requests.head(parsedUrl).status_code == 200:
        return parsedUrl
    return None  # was implicit; made explicit for readability
urls = ['https://www.discudemy.com/all', 'https://www.discudemy.com/all/2']

# NOTE(review): this scraping runs at *import time*, once per Django process
# start; if any request fails or blocks, `data` stays empty and the view
# renders an empty list. Consider moving it into the view or a background task.
for url in urls:
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'html5lib')
    # print(soup)
    for content in soup.find_all('section', class_="card"):
        # print(content)
        try:
            language = content.label.text
            header = content.div.a.text
            day = content.find('span', class_="category").text
            i = content.find('div', class_="image")
            img = i.find('amp-img')['src']
            image = img.replace('240x135', '750x422')
            description = content.find('div', class_="description").text.lstrip()
            myURL = getURL(header)
            udemyURL = requests.get(myURL).text
            udemySoup = BeautifulSoup(udemyURL, 'html5lib')
            udemylink = udemySoup.find_all('a', class_="btn_offer_block re_track_btn")[0]["href"]
            entry = {
                'language': language,
                'header': header,
                'day': day,
                'image': image,
                'description': description,
                'courselink': udemylink,
            }
            data.append(entry)
            print()
        except Exception as e:
            # Silently skipping every failure hides why `data` ends up empty.
            # TODO(review): at minimum log `e` before continuing.
            continue

print(json.dumps(data))
print()
print(data)
def index(req):
    # Serialize the scraped entries so the template can embed them as JSON text.
    payload = json.dumps(data)
    return render(req, 'index.html', {'courses': payload})
Below is my HTML file for displaying JSON data.
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>UdemyCourses</title>
</head>
<body>
{{ courses }}
</body>
</html>
There is some delay in scraping the data; I think that might be the problem. I don't know how to handle asynchronous programming in Python. Is there any way to achieve this? I'm a beginner — help me out. Thanks in advance.
I am fairly new to Scrapy and am following the docs to scrape info on https://pbejobbers.com/abrasives
using my script:
import scrapy
class CrwSpider(scrapy.Spider):
    """Spider for pbejobbers.com that follows the redirect URL embedded in
    the site's anti-bot page (document.location.href assignment)."""

    name = "Otim"

    def start_requests(self):
        urls = [
            'https://pbejobbers.com/abrasives',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # The protection page sets document.location.href = "...token=1";
        # capture the part before the trailing "=1".
        pattern = r'document\.location\.href=\"(:?.*)=1\";'
        url = response.css('script::text').re_first(pattern)
        print(url)

        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

        # FIX: the original computed url + '=1' *before* the None check, which
        # raises TypeError when re_first() finds nothing; guard for None first.
        if url is not None:
            next_page = response.urljoin(url + '=1')
            yield scrapy.Request(next_page, callback=self.parse)
When I run the crawler, instead of getting the actual page, I am getting this JavaScript:
<html>
<body>
<script type="text/javascript" src="/aes.min.js"></script>
<script>
function toNumbers(d) {
var e = [];
d.replace(/(..)/g, function(d) {
e.push(parseInt(d, 16));
});
return e;
}
function toHex() {
for (
var d = [],
d =
1 == arguments.length && arguments[0].constructor == Array
? arguments[0]
: arguments,
e = "",
f = 0;
f < d.length;
f++
)
e += (16 > d[f] ? "0" : "") + d[f].toString(16);
return e.toLowerCase();
}
var a = toNumbers("de50860916c188904e9c359aaaf4f248"),
b = toNumbers("a944ac1efe048739325d92e58868ffa1"),
c = toNumbers("34d8ed644eb63ddaafeb01607ce6b6ce");
document.cookie =
"OCXS=" +
toHex(slowAES.decrypt(c, 2, a, b)) +
"; expires=Thu, 31-Dec-37 23:55:55 GMT; path=/";
document.location.href =
"http://pbejobbers.com/abrasives?81e93addddb02a10cd0652f09370ae96=2";
</script>
</body>
</html>
The actual page consists of a list of product cards. I noticed that document.location.href has a new URL, so I grabbed it using a regular expression and tried to feed it back to the crawler as a new URL to parse, but then I get the same result with a URL ending in =2. I can't seem to get to the actual page.
How can I get the redirect URL and use it? I am really stuck.
You cannot go to the next page using Scrapy alone: when you disable JavaScript in the browser's inspection tools, the Next button stops working, and Scrapy cannot render JavaScript. You have to use Selenium (or another browser-based tool) to scrape this page.
I need a way to get information from a web page. That info is stored in a <script> tag and I can't find a way to extract it. Here is the latest iteration of the code I used:
# NOTE(review): flattened indentation preserved from the original paste;
# the statements below are meant to be indented under their `for` lines.
for link in urls:
# Render the page in the Selenium driver, then parse the rendered HTML.
driver.get(link)
#print(driver.title)
content = driver.page_source
soup = BeautifulSoup(content, features="html.parser")
# NOTE(review): findAll(string=[...]) matches text nodes whose *entire* text
# equals one of these values exactly — 'EM.' is only a substring of the script
# text, so this presumably yields nothing; confirm with BeautifulSoup docs.
for a in soup.findAll(string=['script', "EM.", "productFullPrice"]):
print (a)
# NOTE(review): `a` is a text node here; calling .find(string=[...]) on it is
# unlikely to do what was intended — TODO confirm.
name=a.find(string=['EM.sef_name'])
print(name);
print(a) and print(name) return nothing.
The source code I want to scrape looks like this:
<script type="text/javascript">
var EM = EM || {};
EM.CDN = 'link1';
EM.something = value;
If you want the text inside the tag, you can't just pass 'EM' as the string argument, because BeautifulSoup looks for a string that *exactly* matches 'EM'. That also means it won't match the script tag either — it only looks for the literal string "script" inside text nodes. To get the node you need to pass 'script' as a tag name to the findAll function. The text between the script tags looks like "\n var EM = EM || {};\n EM.CDN = 'link1';\n EM.something = value; \n ", so it won't find 'EM.' because 'EM.' is not equal to that whole string. There are a couple of ways to go about this; here is one that returns values similar to what you posted:
import bs4

html_string = '''
<script type="text/javascript">
var EM = EM || {};
EM.CDN = 'link1';
EM.something = value;
</script>
'''

# Substrings to look for inside each <script> tag's text.
# FIX: the original line `wanted_strings= string=[...]` was an accidental
# chained assignment that also created a stray `string` variable.
wanted_strings = ["EM.", "productFullPrice"]

soup = bs4.BeautifulSoup(html_string, features="html.parser")
wanted = []
script_tags = soup.findAll('script')
for word in wanted_strings:
    for tag in script_tags:
        # Substring test against the tag's full text — findAll(string=...)
        # only hits exact matches, which is why it returned nothing.
        if word in tag.text:
            wanted.append(tag)
# (The trailing bare `wanted` expression was a REPL echo and a no-op in a
# script, so it is dropped here.)
which will then give you the script lines in a list like this with the tags that contain the strings you need
[<script type="text/javascript">
var EM = EM || {};
EM.CDN = 'link1';
EM.something = value;
</script>]
Another way to do this is to just look for the tag and then place each line of code in a list:
import bs4

html_string = '''
<script type="text/javascript">
var EM = EM || {};
EM.CDN = 'link1';
EM.something = value;
</script>
'''

# Parse the markup and collect every <script> node.
soup = bs4.BeautifulSoup(html_string, features="html.parser")
test = soup.findAll('script')

# Split the first script's text into lines, dropping blank/whitespace-only ones.
blah = []
for raw_line in test[0].text.split('\n'):
    stripped = raw_line.strip()
    if stripped:
        blah.append(stripped)
blah
which gives you something like this that may be easier to search for what you need depending on your use case
['var EM = EM || {};', "EM.CDN = 'link1';", 'EM.something = value;']