Python regular expression matching a multiline javascript code - python

I'm having a bit of trouble getting a Python regex to work when matching against text that spans multiple lines. My example is:
function initialize()
{
var myLatlng = new google.maps.LatLng(23.800567,5.942068);
var myOptions =
{
panControl: true,
zoomControl: true,
scaleControl: false,
streetViewControl: true,
zoom: 11,
center: myLatlng,
mapTypeId: google.maps.MapTypeId.HYBRID
}
var map = new google.maps.Map(document.getElementById("map"), myOptions);
var bounds = new google.maps.LatLngBounds();
var locations = [
['<div CLASS="Tekst"><B>tss fsdf<\/B><BR>hopp <BR><\/div>', 54.538665,24.885818, 1, 'text']
,
['<div CLASS="Tekst"><\/div>', 24.465462,24.966919, 1, 'text']
]
What I want to extract is the content of locations. The result should look like this:
- '<div CLASS="Tekst"><B>tss fsdf<\/B><BR>hopp <BR><\/div>',
54.538665,24.885818, 1, 'text'
- '<div CLASS="Tekst"><\/div>', 24.465462,24.966919, 1, 'text'
I try regex like this:
regex = r"var locations =\[\[(.+?)\]\]"
But it doesn't work.

Hello, you can try this regex:
# Raw string so that \] reaches the regex engine as an escaped bracket instead of
# being an invalid string escape. Captures from "<div" up to the last "]" on each
# line ('.' does not cross newlines without re.DOTALL).
re.findall(r"(<div.+)\]", toparse)

Related

Trying to make a local file dictionary and search system

I am trying to make a local file searcher that will search for files by tag and also by name. I don't have any idea how to build the search system or the Python dictionary, and searching with tags confuses me.
files = {'samplefile1.txt', 'samplefile2.txt'}
fileName = ''
fileDiscription = 'Enter Discription here'
isMP3File = True
isMP4File = True
isTxtFile = True
isArchived = True
tags = ['sample1', 'sample2', 'favorited']
# Metadata for every known file, keyed by its on-disk filename.
# The per-file fields use string keys: using the placeholder variables as keys
# would collide (several of them hold the same value) and a list cannot be a key.
filesDictionary = {
    'samplefile1.txt': {
        'fileName': 'coolFile1',
        'fileDiscription': 'cool disc.',
        'isMP3File': False,
        'isMP4File': False,
        'isTxtFile': True,
        'isArchived': False,
        'tags': ['sample1', 'favorited'],
    },
    # Second record belongs to samplefile2.txt; repeating the first key would
    # silently overwrite the entry above.
    'samplefile2.txt': {
        'fileName': 'coolFile2',
        'fileDiscription': 'cool disc2',
        'isMP3File': False,
        'isMP4File': False,
        'isTxtFile': True,
        'isArchived': True,
        'tags': ['sample2'],
    },
}
So in the code above, the search function should show only samplefile1.txt when searching by 'sample1' or 'favorited', and samplefile2.txt when searching by 'sample2'.
(Also, fileName is the name I was talking about in this question, not the file name on the PC.)
(Also, any ideas on how to automate adding entries to this 'files' dictionary using a GUI, similar to how you would post to Twitter, with checkboxes and message boxes?)
Create a dictionary where you have each tag as a key, and the filename as a value.
Since you want to search by tag, having the tags as keys will make the search time constant.
# Tag -> list of filenames carrying that tag, so a lookup by tag is a single
# dictionary access.
searchDict = dict(
    sample1=['samplefile1.txt'],
    favorited=['samplefile1.txt'],
    sample2=['samplefile2.txt'],
)
Then, given a tag, you can get the filename immediately:
searchDict['sample1'] # will return ['samplefile1.txt']
You can then use that key to access your main dictionary files
# Look each matching filename up in the per-file metadata dictionary.
# `files` is only a set of names and cannot be subscripted; the records live in
# `filesDictionary`.
for filename in searchDict['sample1']:
    print(filesDictionary[filename])
will print
{
fileName: 'coolFile1',
fileDiscription: 'cool disc.',
isMP3File: False,
isMP4File: False,
isTxtFile: True,
isArchived: False,
tags = ['sample1', 'favorited']
}
To create the searchDict, you can iterate once over your database of files, getting the tags and associating them to the filenames. It will be a costly operation if your database is big, but once done your search will run in constant time.

How to extract link from html script in python?

How can I extract the URL from a script of HTML with Python?
The HTML provided:
function download() {
window.open('https:somelink.com');
}
const text = `<div style=\'position: relative;padding-bottom: 56.25%;height: 0;overflow: hidden;\'>
<iframe allowfullscreen=\'allowfullscreen\' src=\'URL\' style=\'border: 0;height: 100%;left: 0;position: absolute;top: 0;width: 100%;\' ></iframe>
</div>`;
function embed() {
var element = document.getElementById('embed-text');
console.log(element);
element.innerHTML = text
}
Desired output will be:
https://somelink.com
Any help will do. Thanks!
You should use regex like this:
// Matches the first http:// or https:// URL: the scheme followed by every
// character up to the next whitespace. Group 1 holds the URL.
const urlRegex = /(https?:\/\/[^\s]+)/;
// Sample markup to search.
const input = "<div style=\'position: relative;padding-bottom: 56.25%;height: 0;overflow: hidden;\'><iframe allowfullscreen=\'allowfullscreen\' src=\" https://my-url.com/test \" style=\'border: 0;height: 100%;left: 0;position: absolute;top: 0;width: 100%;\' ></iframe></div>";
// match() returns null when nothing matches, so guard before reading group 1.
const urlMatch = input.match(urlRegex);
console.log(urlMatch ? urlMatch[1] : null); // apply the regex and log the result

Python: How to get full match with RegEx

I'm trying to extract a link from some JavaScript. The JavaScript part isn't relevant anymore because I transformed it into a string (text).
Here is the script part:
<script>
setTimeout("location.href = 'https://airdownload.adobe.com/air/win/download/30.0/AdobeAIRInstaller.exe';", 2000);
$(function() {
$("#whats_new_panels").bxSlider({
controls: false,
auto: true,
pause: 15000
});
});
setTimeout(function(){
$("#download_messaging").hide();
$("#next_button").show();
}, 10000);
</script>
Here is what I do:
import re
def get_link_from_text(text):
    """Return the first ``href = '...'`` match in ``text``, or ``None``.

    Newlines and tabs are removed and runs of spaces are collapsed to a single
    space before searching. The return value is the ``re`` match object itself,
    not the matched string.
    """
    for unwanted in ('\n', '\t'):
        text = text.replace(unwanted, '')
    flattened = re.sub(' +', ' ', text)
    href_pattern = re.compile("href[ ]*=[ ]*'[^;]*")
    return href_pattern.search(flattened)
What I want is the href link and I kind of get it, but for some reason only like this
<_sre.SRE_Match object; span=(30, 112), match="href = 'https://airdownload.adobe.com/air/win/dow>
and not like I want it to be
<_sre.SRE_Match object; span=(30, 112), match="href = 'https://airdownload.adobe.com/air/win/download/30.0/AdobeAIRInstaller.exe'">
So my question is how to get the full link and not only a part of it.
Might the problem be that re.search isn't returning longer strings? I tried altering the regex, and I even tried matching the link one character at a time, but it still returns only the part I called out earlier.
I've modified it slightly, but for me it returns the complete string you desire now.
import re
text = """
<script>
setTimeout("location.href = 'https://airdownload.adobe.com/air/win/download/30.0/AdobeAIRInstaller.exe';", 2000);
$(function() {
$("#whats_new_panels").bxSlider({
controls: false,
auto: true,
pause: 15000
});
});
setTimeout(function(){
$("#download_messaging").hide();
$("#next_button").show();
}, 10000);
</script>
"""
def get_link_from_text(text):
    """Print the list of every ``href = '...'`` occurrence found in ``text``.

    Newlines and tabs are removed and runs of spaces are collapsed to a single
    space before searching. Nothing is returned.
    """
    flattened = text.replace('\n', '').replace('\t', '')
    flattened = re.sub(' +', ' ', flattened)
    matches = re.findall("href[ ]*=[ ]*'[^;]*", flattened)
    print(matches)
get_link_from_text(text)
Output:
["href = 'https://airdownload.adobe.com/air/win/download/30.0/AdobeAIRInstaller.exe'"]

How to extract text from file (with in script tag) using Python or beautifulsoup

Could you please help me with this little thing? I want to extract the lat and lng values from the code below, which sits inside a SCRIPT tag (not in the body), using Beautiful Soup or plain Python. I am new to Python, and blogs recommend using Beautiful Soup for extraction.
I want these two values lat: 21.25335 , lng: 81.649445
I am using a regular expression for this. My regular expression is "^l([a-t])(:) ([0-9])([^,]+)"
Check this link for Regular expression and html file -
http://regexr.com/3glde
I get those two value with this regular expression but i want only those lat and lng value (numeric part ) to be stored in variable .
Here below is my python code which I am using
import re
# Matches a line that starts with "l" followed by two letters in the range a-t
# (e.g. "lat", "lng"), then ": ", a single digit, and everything up to the next
# comma. Every piece is its own capture group, so the number comes back split
# across the last two groups.
pattern = re.compile("^[l]([a-t])([a-t])(\:) ([0-9])([^,]+)")
# Scan the file line by line; i is zero-based, so i+1 is reported as the line number.
for i, line in enumerate(open('C:\hile_text.html')):
for match in re.finditer(pattern, line):
print 'Found on line %s: %s' % (i+1, match.groups())
Output:
Found on line 3218: ('a', 't', ':', '2', '1.244791')
Found on line 3219: ('n', 'g', ':', '8', '1.643486')
I want only the numeric values as output, like 21.25335,81.649445, and I want to store them in variables. Alternatively, you can suggest different code that does this.
Please help me out soon. Thanks in anticipation.
This is the script tag in html file .
<script type="text/javascript">
window.mapDivId = 'map0Div';
window.map0Div = {
lat: 21.25335,
lng: 81.649445,
zoom: null,
locId: 5897747,
geoId: 297595,
isAttraction: false,
isEatery: true,
isLodging: false,
isNeighborhood: false,
title: "Aman Age Roll & Chicken ",
homeIcon: true,
url: "/Restaurant_Review-g297595-d5897747-Reviews-Aman_Age_Roll_Chicken-Raipur_Raipur_District_Chhattisgarh.html",
minPins: [
['hotel', 20],
['restaurant', 20],
['attraction', 20],
['vacation_rental', 0] ],
units: 'km',
geoMap: false,
tabletFullSite: false,
reuseHoverDivs: false,
noSponsors: true };
ta.store('infobox_js', 'https://static.tacdn.com/js3/infobox-c-v21051733989b.js');
ta.store("ta.maps.apiKey", "");
(function() {
var onload = function() {
if (window.location.hash == "#MAPVIEW") {
ta.run("ta.mapsv2.Factory.handleHashLocation", {}, true);
}
}
if (window.addEventListener) {
if (window.history && window.history.pushState) {
window.addEventListener("popstate", function(e) {
ta.run("ta.mapsv2.Factory.handleHashLocation", {}, false);
}, false);
}
window.addEventListener('load', onload, false);
}
else if (window.attachEvent) {
window.attachEvent('onload', onload);
}
})();
ta.store("mapsv2.show_sidebar", true);
ta.store('mapsv2_restaurant_reservation_js', ["https://static.tacdn.com/js3/ta-mapsv2-restaurant-reservation-c-v2430632369b.js"]);
ta.store('mapsv2.typeahead_css', "https://static.tacdn.com/css2/maps_typeahead-v21940478230b.css");
// Feature gate VR price pins on SRP map. VRC-14803
ta.store('mapsv2.vr_srp_map_price_enabled', true);
ta.store('mapsv2.geoName', 'Raipur');
ta.store('mapsv2.map_addressnotfound', "Address not found"); ta.store('mapsv2.map_addressnotfound3', "We couldn\'t find that location near {0}. Please try another search."); ta.store('mapsv2.directions', "Directions from {0} to {1}"); ta.store('mapsv2.enter_dates', "Enter dates for best prices"); ta.store('mapsv2.best_prices', "Best prices for your stay"); ta.store('mapsv2.list_accom', "List of accommodations"); ta.store('mapsv2.list_hotels', "List of hotels"); ta.store('mapsv2.list_vrs', "List of holiday rentals"); ta.store('mapsv2.more_accom', "More accommodations"); ta.store('mapsv2.more_hotels', "More hotels"); ta.store('mapsv2.more_vrs', "More Holiday Homes"); ta.store('mapsv2.sold_out_on_1', "SOLD OUT on 1 site"); ta.store('mapsv2.sold_out_on_y', "SOLD OUT on 2 sites"); </script>
Your regular expression is a bit messed up.
^l says you are trying to match an 'l' that is the very first character on a line.
^\s+(l[an][gt])(:\s+)(\d+\.\d+) would work better. Check out a regex analyzer tool such as http://www.myezapp.com/apps/dev/regexp/show.ws to get a breakdown of what is happening.
Here is a breakdown
Sequence: match all of the followings in order
BeginOfLine
Repeat
WhiteSpaceCharacter
one or more times
CapturingGroup
GroupNumber:1
Sequence: match all of the followings in order
l
AnyCharIn[ a n]
AnyCharIn[ g t]
CapturingGroup
GroupNumber:2
Sequence: match all of the followings in order
:
Repeat
WhiteSpaceCharacter
one or more times
CapturingGroup
GroupNumber:3
Sequence: match all of the followings in order
Repeat
Digit
one or more times
.
Repeat
Digit
one or more times

Retrieving a string using REGEX in Python 2.7.2

I have the following code snippet from page source:
var myPDF = new PDFObject({
url: "http://www.site.com/doc55.pdf",
id: "pdfObjectContainer",
width: "100%",
height: "700px",
pdfOpenParams: {
navpanes: 0,
statusbar: 1,
toolbar: 1,
view: "FitH"
}
}).embed("pdf_placeholder");
the
'PDFObject('
is unique on the page. I want to retrieve the url content using a regex. In this case I need to get
http://www.site.com/doc55.pdf
Please help.
Here is an alternative for solving your problem without using regex:
# Print the quoted value of the first "url:" line found at or after the line
# containing 'PDFObject(' in the file named 'input'; prints None if no such line exists.
url,in_object = None, False
with open('input') as f:
for line in f:
# Latches to True once 'PDFObject(' has been seen and stays True afterwards.
in_object = in_object or 'PDFObject(' in line
if in_object and 'url:' in line:
# split('"')[1] is the text between the first two double quotes on the line.
url = line.split('"')[1]
break
print url
In order to be able to find "something that happens in the line after something else", you need to match things "including the newline". For this you use the (dotall) modifier - a flag added during the compilation.
Thus the following code works:
import re
r = re.compile(r'(?<=PDFObject).*?url:.*?(http.*?)"', re.DOTALL)
s = '''var myPDF = new PDFObject({
url: "http://www.site.com/doc55.pdf",
id: "pdfObjectContainer",
width: "100%",
height: "700px",
pdfOpenParams: {
navpanes: 0,
statusbar: 1,
toolbar: 1,
view: "FitH"
}
}).embed("pdf_placeholder"); '''
print r.findall(s)
Explanation:
r = re.compile( compile regular expression
r' raw string literal: backslashes are passed to the regex engine unchanged
(?<=PDFObject) the match I want happens right after PDFObject
.*? then there may be some other characters...
url: followed by the string url:
.*? then lazily match whatever follows (the `?` makes the match non-greedy) until the first instance of
(http.*?)" the text starting with http, captured up to (but not including) the first "
', end of regex string, but there's more...
re.DOTALL) set the DOTALL flag - this means the dot matches all characters
including newlines. This allows the match to continue from one line
to the next in the .*? right after the lookbehind
using a combination of look-behind and look-ahead assertions
import re
re.search(r'(?<=url:).*?(?=",)', s).group().strip('" ')
'http://www.site.com/doc55.pdf'
This works:
import re
src='''\
var myPDF = new PDFObject({
url: "http://www.site.com/doc55.pdf",
URL: "http://www.site.com/doc52.PDF",
id: "pdfObjectContainer",
width: "100%",
height: "700px",
pdfOpenParams: {
navpanes: 0,
statusbar: 1,
toolbar: 1,
view: "FitH"
}
}).embed("pdf_placeholder"); '''
# Inner re.search: capture everything after 'PDFObject({' (re.S lets '.' span lines).
# re.finditer: within that text, match each line that starts with "url:" in any
# letter case (re.M | re.I); [\W]$ consumes the final non-word character before the
# end of the line (the trailing comma), leaving the quoted URL in group 1.
# Finally, strip the surrounding double quotes from each captured value.
print [m.group(1).strip('"') for m in
re.finditer(r'^url:\s*(.*)[\W]$',
re.search(r'PDFObject\(\{(.*)',src,re.M | re.S | re.I).group(1),re.M|re.I)]
prints:
['http://www.site.com/doc55.pdf', 'http://www.site.com/doc52.PDF']
Regex
new\s+PDFObject\(\{\s*url:\s*"[^"]+"
Demo
Extract url only
If 'PDFObject(' is the unique identifier in the page, you only have to match the first next quoted content.
Using the DOTALL flag (re.DOTALL or re.S) and the non-greedy star (*?), you can write:
import re

snippet = '''
var myPDF = new PDFObject({
url: "http://www.site.com/doc55.pdf",
id: "pdfObjectContainer",
width: "100%",
height: "700px",
pdfOpenParams: {
navpanes: 0,
statusbar: 1,
toolbar: 1,
view: "FitH"
}
}).embed("pdf_placeholder");
'''

# Both patterns skip lazily from 'PDFObject(' to the first double-quoted string
# and capture its contents; re.S lets '.' cross the newline after 'PDFObject({'.

# First version using unnamed groups
RE_UNNAMED = re.compile(r'PDFObject\(.*?"(.*?)"', re.S)
# Second version using named groups
RE_NAMED = re.compile(r'PDFObject\(.*?"(?P<url>.*?)"', re.S)

# Flags are given to re.compile() only. The second positional argument of a
# compiled pattern's search() is the start position, so passing re.S there
# would begin the scan at index 16 instead of setting a flag.
RE_UNNAMED.search(snippet).group(1)
RE_NAMED.search(snippet).group('url')
# result for both: 'http://www.site.com/doc55.pdf'
If you don't want to compile your regex because it's used once, simply this syntax:
re.search(r'PDFObject\(.*?"(.*?)"', snippet, re.S).group(1)
re.search(r'PDFObject\(.*?"(?P<url>.*?)"', snippet, re.S).group('url')
Four choices; one should match your needs and taste!
Although the other answers may appear to work, most do not take into account that the only unique thing on the page is 'PDFObject('. A much better regular expression would be the following:
PDFObject\({\surl: "(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)",
It takes into account that 'PDFObject(' is unique and contains some basic URL verification.
Below is an example of how this regex could be used in python
>>> import re
>>> strs = """var myPDF = new PDFObject({
... url: "http://www.site.com/doc55.pdf",
... id: "pdfObjectContainer",
... width: "100%",
... height: "700px",
... pdfOpenParams: {
... navpanes: 0,
... statusbar: 1,
... toolbar: 1,
... view: "FitH"
... }
... }).embed("pdf_placeholder");"""
>>> re.search(r'PDFObject\({\surl: "(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)",',strs).group(1)
'http://www.site.com/doc55.pdf'
A pure python (no regex) alternative would be:
>>> unique = 'PDFObject({\nurl: "'
>>> start = strs.find(unique) + len(unique)
>>> end = start + strs[start:].find('"')
>>> strs[start:end]
'http://www.site.com/doc55.pdf'
No regex oneliner:
>>> (lambda u:(lambda s:(lambda e:strs[s:e])(s+strs[s:].find('"')))(strs.find(u)+len(u)))('PDFObject({\nurl: "')
'http://www.site.com/doc55.pdf'

Categories

Resources