Regex doesn't seem to match, even though it should - Python

I'm having trouble matching the string W0mMhUcRRnG8dcghE4qvk3JA9lGt8nDl with my regex ^([A-Za-z0-9]{32})$.
According to various online regex testers it should match, but it doesn't in my Python script:
pattern = re.compile("^([A-Za-z0-9]{32})$")
print(line)
if pattern.match(line):
    return line
else:
    return None
I've tried strip() to rule out invisible whitespace, but that didn't change anything.
Here is the entire script:
import requests, binascii, base64, re
from requests.auth import HTTPBasicAuth


def pattern_lookup(line):
    """
    Checks a line for a matching string that
    is 32 characters long and only
    holds alphanumerical characters.
    -----
    :param line: The line to be checked.
    :return: The line holding the matched string,
             or None if not found.
    """
    pattern = re.compile("^([A-Za-z0-9]{32})$")
    print(line)
    if pattern.match(line):
        return line
    else:
        return None


def get_secret(host, credentials):
    """
    Grabs the hint (flag) from the
    host by splitting the response on
    semicolon (;), then performing
    pattern matching using regex.
    ----
    :param host: The host we are sending
                 requests to.
    :param credentials: The credentials required
                        to sign into the host.
    :return: The hex encoded secret.
    """
    try:
        response = requests.get(host, auth=HTTPBasicAuth(*credentials))
        response_lines = response.content.decode('ascii').replace('"', '').split(';')
        return next((line
                     for line in response_lines
                     if pattern_lookup(line)),
                    None)
    except requests.RequestException as e:
        print(e)


def prepare_payload(secret):
    decoded_secret = base64.b64decode(binascii.unhexlify(secret)[::-1])
    payload = {'secret': decoded_secret, 'submit': 'placeholder'}
    return payload


def get_password(host, credentials, secret):
    """
    Uses a POST request injected with the
    reverse engineered secret to get access
    to the password to natas9.
    :param host: The host that holds the
                 password.
    :param credentials: The credentials required
                        to sign into the host.
    :param secret: The reverse engineered secret.
    :return: The password to natas9.
    """
    payload = prepare_payload(secret)
    try:
        response = requests.post(host, auth=HTTPBasicAuth(*credentials), data=payload)
        response_lines = response.content.decode('utf-8').split(' ')
        return next((line
                     for line in response_lines
                     if pattern_lookup(line.strip())),
                    None)
    except requests.RequestException as e:
        print(e)


def main():
    host = 'http://natas8.natas.labs.overthewire.org/index-source.html'
    credentials = ['natas8', 'DBfUBfqQG69KvJvJ1iAbMoIpwSNQ9bWe']
    secret = get_secret(host, credentials)
    print(get_password(host.split('index')[0], credentials, secret))


if __name__ == '__main__':
    main()
EDIT:
I should mention that the initial test in get_secret works absolutely flawlessly and all my previous modules that use this work fine...
EDIT2:
Output:
<link
rel="stylesheet"
type="text/css"
href="http://natas.labs.overthewire.org/css/level.css">
<link
rel="stylesheet"
href="http://natas.labs.overthewire.org/css/jquery-ui.css"
/>
<link
rel="stylesheet"
href="http://natas.labs.overthewire.org/css/wechall.css"
/>
<script
src="http://natas.labs.overthewire.org/js/jquery-1.9.1.js"></script>
<script
src="http://natas.labs.overthewire.org/js/jquery-ui.js"></script>
<script
src=http://natas.labs.overthewire.org/js/wechall-data.js></script><script
src="http://natas.labs.overthewire.org/js/wechall.js"></script>
<script>var
wechallinfo
=
{
"level":
"natas8",
"pass":
"DBfUBfqQG69KvJvJ1iAbMoIpwSNQ9bWe"
};</script></head>
<body>
<h1>natas8</h1>
<div
id="content">
Access
granted.
The
password
for
natas9
is
W0mMhUcRRnG8dcghE4qvk3JA9lGt8nDl <-- here it is
<form
method=post>
Input
secret:
<input
name=secret><br>
<input
type=submit
name=submit>
</form>
<div
id="viewsource"><a
href="index-source.html">View
sourcecode</a></div>
</div>
</body>
</html>
None

I made a demo based on your regex, and it works fine:
import re

line = 'W0mMhUcRRnG8dcghE4qvk3JA9lGt8nDl'
pattern = re.compile("^([A-Za-z0-9]{32})$")
print(line)
if pattern.match(line):
    print("matched")
else:
    print("No")
That means the line you are reading from response_lines is not in the format the regex expects. Try printing the line and see what's different.
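One low-tech way to do that, in case it helps: print repr() of each candidate so invisible characters (newlines, tabs, stray quotes) become visible. A small sketch reusing the names from your question:
# Inspect what pattern_lookup actually receives; repr() makes hidden
# characters such as '\n' or '\r' show up in the printed output.
for line in response_lines:
    print(repr(line))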
Edit: After your edit, I can see you have multiline data. Use the below (search scans the whole string, so with re.MULTILINE the anchors apply to each embedded line):
pattern = re.compile("^([A-Za-z0-9]{32})$", re.MULTILINE)
if pattern.search(line):
    print("matched")
else:
    print("No")

Your text is multiline. Have you tried with:
re.compile("^([A-Za-z0-9]{32})$", re.MULTILINE)
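A quick sketch of why that helps. The chunk value below is only a guess at what split(' ') can hand to pattern_lookup (the password immediately followed by a newline and the next tag), but it shows the difference the flag makes:
import re

chunk = "W0mMhUcRRnG8dcghE4qvk3JA9lGt8nDl\n<form"  # hypothetical token from split(' ')

plain = re.compile(r"^([A-Za-z0-9]{32})$")
multi = re.compile(r"^([A-Za-z0-9]{32})$", re.MULTILINE)

print(plain.match(chunk))           # None: $ only anchors at the end of the whole string
print(multi.match(chunk).group(1))  # the password: with MULTILINE, $ also anchors before '\n'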


Issue with Lambda execution time

I am working on a project where I have to scrape as many URLs as possible (listed in a file in an S3 bucket) in a limited time and store them in a searchable database. Right now I am having an issue while scraping web pages inside an AWS Lambda. I have a function for my task which, when run in a Google Colab environment, takes only 7-8 seconds to execute and produces the desired results. But the same function, when deployed as a Lambda, takes almost 10x more time to execute. Here is my code:
import requests
import re
import validators
import boto3
from smart_open import open
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

nltk.data.path.append("/tmp")
nltk.download("stopwords", download_dir="/tmp")


def CrawlingLambda(event, context):
    """
    This lambda crawls a list of webpages, reading URLs from an S3 bucket,
    and returns a dictionary pairing each URL with its keywords.
    Args:
        event: The Lambda invocation event (unused).
        context: The Lambda runtime context (unused).
    Returns:
        dict: A dictionary mapping each crawled URL to its extracted keywords.
    """
    results = {}
    client = boto3.client('s3')
    for line in open('s3://urls-to-monitor/URLs1T.txt', transport_params={'client': client}):
        if line[len(line) - 1] != '/':
            url = line[:len(line) - 2]
        else:
            url = line
        if validation(url) == False:
            continue
        try:
            web_content = scrape_web(url)
            results[url] = web_content
        except:
            continue
    return results
def validation(url):
    """
    Validates the URL string. This method uses regular expressions for validation under the hood.
    Args:
        url: URL to validate
    Returns:
        bool: True if the passed string is a valid URL and False otherwise.
    """
    return validators.url(url)
def scrape_web(url):
    """
    This function scrapes a given URL's web page for a specific set of keywords.
    Args:
        url: Page's URL to be scraped
    Return:
        filtered_words: A refined list of extracted words from the web page.
    """
    try:
        res = requests.get(url, timeout=2)
    except:
        raise ValueError
    if res.status_code != 200:
        raise ValueError
    html_page = res.content
    soup = remove_tags(html_page)
    content = soup.get_text()
    words = re.split(r"\s+|/", content.lower())
    filtered_words = clean_wordlist(words)
    return tuple(filtered_words)
def remove_tags(html):
    """
    Remove the specified tags from the HTML response received from the requests.get() method.
    Args:
        html: HTML response of the web page
    Returns:
        soup: Parsed response of HTML
    """
    # parse html content
    soup = BeautifulSoup(html, "html.parser")
    for data in soup(['style', 'script', 'noscript']):
        # Remove tags
        data.decompose()
    # return data by retrieving the tag content
    return soup
def clean_wordlist(wordlist):
    """
    This function removes any punctuation marks and stop words from our extracted wordlist.
    Args:
        wordlist: A list of raw words extracted from the html response of the web page.
    Returns:
        key_words: A filtered list of words containing only key words
    """
    words_without_symbol = []
    for word in wordlist:
        # Symbols to ignore
        symbols = "!@#$%^&*()_-+={[}]|\;:\"<>?/., "
        for i in range(len(symbols)):
            word = word.replace(symbols[i], '')
        if len(word) > 0:
            words_without_symbol.append(word)
    # ignoring the stopwords
    key_words = [word for word in words_without_symbol if not word in stopwords.words()]
    return key_words
Any directions on why there is such a time difference, and how I can reduce it?
The only thing you can configure to affect performance is memory allocation. Try increasing the memory allocated to your function until you have at least the same performance as with Colab.
Billing shouldn't be affected much, as it is calculated as the product of memory and execution time.
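If you'd rather change it from code than from the console, here is a minimal boto3 sketch (the function name and memory size are placeholders):
import boto3

lambda_client = boto3.client('lambda')

# Raise the memory allocation; CPU share scales with memory on Lambda,
# which is usually what speeds up network- and parsing-heavy functions.
lambda_client.update_function_configuration(
    FunctionName='my-crawling-lambda',  # placeholder name
    MemorySize=1024,                    # in MB; try 1024-2048 and measure
)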

How can you print the scraped values in a paragraph format? Django

One quick question. I'm building a scraper that outputs emails. For now the emails get printed to a CSV file, but I want them to be output in paragraph format on another URL. I've tried a few things but it doesn't work out. Here is the code:
views.py
from django.shortcuts import render
from .scraper import EmailCrawler


def index(request):
    return render(request, 'leadfinderapp/scrape.html')


def scrape(request):
    url = request.GET.get('Email')
    crawl = EmailCrawler(url)
    crawl.crawl()
    return render(request, 'leadfinderapp/results.html')
Here is the HTML file where I'm trying to output the emails (ignore the code) (results.html):
{% load static %}
<html>
{% for email in scrape %}
<p>{{ result }}</p>
{% endfor %}
</html>
email crawler (scraper.py):
import re
import requests
import requests.exceptions
from urllib.parse import urlsplit, urljoin
from lxml import html
import sys
import csv


class EmailCrawler:

    processed_urls = set()
    unprocessed_urls = set()
    emails = set()

    def __init__(self, website: str):
        # processed_urls = set()
        # unprocessed_urls = set()
        # emails = set()
        emails = set()
        self.website = website
        self.email = emails
        self.unprocessed_urls.add(website)
        self.headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/78.0.3904.70 Chrome/78.0.3904.70 Safari/537.36',
        }
        self.base_url = urlsplit(self.website).netloc
        self.outputfile = self.base_url.replace('.', '_') + '.csv'
        # we will use this list to skip urls that contain one of these extensions. This will save us a lot of bandwidth and speed up the crawling process
        # for example: www.example.com/image.png --> this url is useless for us. we cannot possibly parse emails from images and all other types of files.
        self.garbage_extensions = ['.aif','.cda','.mid','.midi','.mp3','.mpa','.ogg','.wav','.wma','.wpl','.7z','.arj','.deb','.pkg','.rar','.rpm','.tar.gz','.z','.zip','.bin','.dmg','.iso','.toast','.vcd','.csv','.dat','.db','.dbf','.log','.mdb','.sav','.sql','.tar','.apk','.bat','.bin','.cgi','.pl','.exe','.gadget','.jar','.py','.wsf','.fnt','.fon','.otf','.ttf','.ai','.bmp','.gif','.ico','.jpeg','.jpg','.png','.ps','.psd','.svg','.tif','.tiff','.asp','.cer','.cfm','.cgi','.pl','.part','.py','.rss','.key','.odp','.pps','.ppt','.pptx','.c','.class','.cpp','.cs','.h','.java','.sh','.swift','.vb','.ods','.xlr','.xls','.xlsx','.bak','.cab','.cfg','.cpl','.cur','.dll','.dmp','.drv','.icns','.ico','.ini','.lnk','.msi','.sys','.tmp','.3g2','.3gp','.avi','.flv','.h264','.m4v','.mkv','.mov','.mp4','.mpg','.mpeg','.rm','.swf','.vob','.wmv','.doc','.docx','.odt','.pdf','.rtf','.tex','.txt','.wks','.wps','.wpd']
        self.email_count = 0
    def crawl(self):
        """
        It will continue crawling until the unprocessed urls list is empty
        """
        url = self.unprocessed_urls.pop()
        print("CRAWL : {}".format(url))
        self.parse_url(url)

        if len(self.unprocessed_urls) != 0:
            self.crawl()
        else:
            print('End of crawling for {} '.format(self.website))
            print('Total urls visited {}'.format(len(self.processed_urls)))
            print('Total Emails found {}'.format(self.emails))
    def parse_url(self, current_url: str):
        """
        It will load and parse a given url, finding all the urls in the page.
        It also filters the urls and adds them to the unprocessed url list.
        Finally it scrapes the emails, if found on the page, and updates the email list.
        INPUT:
            current_url: URL to parse
        RETURN:
            None
        """
        # we will retry to visit a url 5 times in case it fails. after that we will skip it in case it still fails to load
        response = requests.get(current_url, headers=self.headers)
        tree = html.fromstring(response.content)
        urls = tree.xpath('//a/@href')  # getting all urls in the page
        # Here we will make sure that we convert relative urls to full urls
        # example --> /about.html --> https://www.website.com/about.html
        urls = [urljoin(self.website, url) for url in urls]
        # now let's make sure that we only include the urls that fall under our domain, i.e. filtering urls that point outside our main website.
        urls = [url for url in urls if self.base_url == urlsplit(url).netloc]
        # removing duplicates
        urls = list(set(urls))
        # filtering urls that point to files such as images, videos and others as listed in garbage_extensions
        # Here we will loop through all the urls and skip them if they contain one of the extensions
        parsed_url = []
        for url in urls:
            skip = False
            for extension in self.garbage_extensions:
                if not url.endswith(extension) and not url.endswith(extension + '/'):
                    pass
                else:
                    skip = True
                    break
            if not skip:
                parsed_url.append(url)

        # finally filtering urls that are already in queue or already visited
        for url in parsed_url:
            if url not in self.processed_urls and url not in self.unprocessed_urls:
                self.unprocessed_urls.add(url)

        # parsing email
        self.parse_emails(response.text)
        # adding the current url to processed list
        self.processed_urls.add(current_url)
    def parse_emails(self, text: str):
        """
        It scans the given text to find email addresses and then writes them to csv
        Input:
            text: text to parse emails from
        Returns:
            bool: True or False (True if an email was found on the page)
        """
        # parsing emails and then saving to csv
        emails = set(re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text, re.I))
        # TODO: sometimes "gFJS3amhZEg_z39D5EErVg@2x.png" gets accepted as an email by the above regex. so for now i will check if the email ends with jpeg, png or jpg
        for email in emails:
            skip_email = False
            for checker in ['jpg', 'jpeg', 'png']:
                if email.endswith(checker):
                    skip_email = True
                    break
            if not skip_email:
                if email not in self.emails:
                    print(email)
                    self.email_count += 1
                    self.emails.add(email)

        if len(emails) != 0:
            return True
        else:
            return False


# try:
#     website = sys.argv[1]
# except:
#     website = input("Please enter a website to crawl for emails:")
# crawl = EmailCrawler(website)
# crawl.crawl()
You should include the results as context in the render method!
What does your crawl() method return? If it returns a list of emails you should do something like:
# ...
email_list = crawl.crawl()
return render(request, 'leadfinderapp/results.html', {"email_list": email_list})
Then in the template you can loop over them:
{% for email in email_list %}
<p> {{ email }} </p>
{% endfor %}
EDIT
The crawl method does not return anything. Therefore my suggestion above (which assumes a list is returned) will not work...
I see that the method is called recursively, so what I suggest is that you update a class-level variable at each iteration (see the sketch after the view code below):
define email_list in your __init__ (or perhaps you can use your current emails set?)
append the scraped emails to that variable from within the crawl method at each iteration
then in your view:
def scrape(request):
    url = request.GET.get('Email')
    crawl = EmailCrawler(url)
    crawl.crawl()
    email_list = crawl.email_list  # or whatever class-level variable you use
    return render(request, 'leadfinderapp/results.html', {"email_list": email_list})
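And here is a minimal, self-contained sketch of the class-side change (the attribute name email_list, the crawl body, and the fake page text are purely illustrative; your existing emails set would work just as well):
import re


class EmailCrawler:
    """Illustration only: collect emails into an instance attribute
    so the view can read them after crawl() finishes."""

    def __init__(self, website: str):
        self.website = website
        self.email_list = []  # illustrative name for the collected emails

    def crawl(self):
        # In the real crawler this is recursive; here we just parse one page's text.
        fake_page_text = "contact us at info@example.com or sales@example.com"
        self.parse_emails(fake_page_text)

    def parse_emails(self, text: str):
        for email in set(re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text)):
            if email not in self.email_list:
                self.email_list.append(email)


crawler = EmailCrawler("https://example.com")
crawler.crawl()
print(crawler.email_list)  # the view would pass this list to the template context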

Self processing page with sql and python

I'm creating a shop page as my college project. I have a database set up, and I'm trying to use the select and submit to open a new page which displays information about the selected part. I just can't seem to be able to extract data from the database. Can you help, please?
That's part of my HTML:
<p class="heading2">List of CPU's:</p>
<form action="show_part.py">
<select name="component">
<option value="Intel_i7-5960x">Intel i7-5960x</option>
Then this is the python
#!/usr/local/bin/python3
from cgitb import enable
import pymysql as db
from cgi import FieldStorage

print('Content-Type: text/html')
print()

form_data = FieldStorage()
component = form_data.getfirst("name")
result = ''

try:
    connection = db.connect('***', '***', '***', '***')
    cursor = connection.cursor(db.cursors.DictCursor)
    result += cursor.execute("""SELECT price
                                FROM components
                                WHERE name = %s""" % (component))
    cursor.close()
    connection.close()
except db.Error:
    result = '<p>Sorry! We are experiencing problems at the moment. Please call back later.</p>'

print("""
<!DOCTYPE html>
<html lang="en">
<head>
<title>Part details</title>
</head>
<body>
%s
</body>
</html>""" % (result))
What happens is I keep getting the error message on the new page, meaning the Python runs but something goes wrong with the database query. Any suggestions?
I think I cracked it; the problem was in the SQL query, where the program tried to execute:
SELECT price
FROM components
WHERE name=intel_i7-5960x
What is missing is the quotes:
SELECT price
FROM components
WHERE name='intel_i7-5960x'
So I changed that in my HTML file, and it gets me not exactly where I want to go, but it's a start!
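For what it's worth, a way to avoid the quoting problem entirely is to let the driver substitute the value, which handles quoting and escaping for you. A minimal sketch assuming pymysql's parameter substitution (the connection details and component value are placeholders):
import pymysql as db

component = 'Intel_i7-5960x'  # in the real script this comes from the form data

connection = db.connect('***', '***', '***', '***')
cursor = connection.cursor(db.cursors.DictCursor)

# Pass the value as a separate argument instead of formatting it into the SQL
# string; the driver quotes and escapes it, so no manual quotes are needed.
cursor.execute("SELECT price FROM components WHERE name = %s", (component,))
row = cursor.fetchone()

cursor.close()
connection.close()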

How to pass python variable to html variable?

I need to read a URL from a text file in Python as a variable, and use it in HTML.
The text file "file.txt" contains only one line, "http://188.xxx.xxx.xx:8878". This line should be saved in the variable "link", and I should then use the content of this variable in the HTML, so that the link is opened when I click on the button image "go_online.png". I tried to change my code as follows, but it doesn't work! Any help, please?
#!/usr/bin/python
import cherrypy
import os.path
from auth import AuthController, require, member_of, name_is


class Server(object):

    _cp_config = {
        'tools.sessions.on': True,
        'tools.auth.on': True
    }
    auth = AuthController()

    @cherrypy.expose
    @require()
    def index(self):
        f = open("file.txt", "r")
        link = f.read()
        print link
        f.close()
        html = """
<html>
<script language="javascript" type="text/javascript">
var var_link = '{{ link }}';
</script>
<body>
<p>{htmlText}
<p>
</body>
</html>
"""
        myText = ''
        myText = "Hellow World"
        return html.format(htmlText=myText)
    index.exposed = True


# configuration
conf = {
    'global': {
        'server.socket_host': '0.0.0.0',  # 0.0.0.0 or specific IP
        'server.socket_port': 8085  # server port
    },
    '/images': {  # images served as static files
        'tools.staticdir.on': True,
        'tools.staticdir.dir': os.path.abspath('/home/ubuntu/webserver/images')
    }
}

cherrypy.quickstart(Server(), config=conf)
First off, I'm not sure the JavaScript part makes any sense; just leave it out. Also, you're opening a p tag but not closing it. I'm not sure what your templating engine is, but you could just pass in the variables in pure Python. Also, make sure to put quotes around your link. So your code should be something like:
class Server(object):

    _cp_config = {
        'tools.sessions.on': True,
        'tools.auth.on': True
    }
    auth = AuthController()

    @cherrypy.expose
    @require()
    def index(self):
        f = open("file.txt", "r")
        link = f.read()
        f.close()
        myText = "Hello World"
        html = """
<html>
<body>
<p>%s</p>
<a href="%s"><img src="/images/go_online.png"></a>
</body>
</html>
""" % (myText, link)
        return html
    index.exposed = True
(By the way, the %s things are string placeholders that will be populated with the variables in %(firstString, secondString) at the end of the multi-line string.)
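A tiny standalone illustration of that mapping (the values are made up):
# The two %s placeholders are filled left to right by the tuple after %.
my_text = "Hello World"
link = "http://188.xxx.xxx.xx:8878"
html = "<p>%s</p>\n<a href=\"%s\">go online</a>" % (my_text, link)
print(html)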

Is there a Django template filter that handles "...more" and when you click on it, it shows more of the text?

Suppose I have a huge paragraph.
I just want the top 15 words to be shown. After that, the person clicks "more" to see the rest of the text.
Just whipped this up, seems to do what you want, and there's no dependency on any external JS libs.
DISCLAIMER: I haven't tried this in IE, but chrome and firefox work fine.
from django import template
from django.utils.html import escape
from django.utils.safestring import mark_safe

register = template.Library()

import re

readmore_showscript = ''.join([
    "this.parentNode.style.display='none';",
    "this.parentNode.parentNode.getElementsByClassName('more')[0].style.display='inline';",
    "return false;",
])


@register.filter
def readmore(txt, showwords=15):
    global readmore_showscript
    words = re.split(r' ', escape(txt))

    if len(words) <= showwords:
        return txt

    # wrap the more part
    words.insert(showwords, '<span class="more" style="display:none;">')
    words.append('</span>')

    # insert the readmore part
    words.insert(showwords, '<span class="readmore">... <a href="#" onclick="')
    words.insert(showwords + 1, readmore_showscript)
    words.insert(showwords + 2, '">read more</a>')
    words.insert(showwords + 3, '</span>')

    # Wrap with <p>
    words.insert(0, '<p>')
    words.append('</p>')

    return mark_safe(' '.join(words))

readmore.is_safe = True
To use it, just create a templatetags folder in your app, create the __init__.py file in there, and then drop this code into readmore.py.
Then at the top of any template where you want to use it, just add: {% load readmore %}
To use the filter itself:
{{ some_long_text_var|readmore:15 }}
The :15 tells how many words you want to show before the read more link.
If you want anything fancy like ajax loading of the full content, that's quite a bit different and would require a bit more infrastructure.
Use truncatechars_html; refer to https://docs.djangoproject.com/en/1.8/ref/templates/builtins/#truncatechars-html
truncatechars_html
Similar to truncatechars, except that it is aware of HTML tags. Any tags that are opened in the string and not closed before the truncation point are closed immediately after the truncation.
For example:
{{ value|truncatechars_html:9 }}
If value is "<p>Joel is a slug</p>", the output will be "<p>Joel i...</p>".
Newlines in the HTML content will be preserved.
There is a truncatewords filter, although you still need a JavaScript helper to do what you described.
from django import template
from django.utils.html import escape
from django.utils.safestring import mark_safe
from django.utils.translation import ugettext as _

register = template.Library()


@register.filter
def readmore(text, cnt=250):
    text, cnt = escape(text), int(cnt)
    if len(text) > cnt:
        first_part = text[:cnt]
        link = u'%s' % _('read more')
        second_part = u'%s<span class="hide">%s</span>' % (link, text[cnt:])
        return mark_safe('... '.join([first_part, second_part]))
    return text

readmore.is_safe = True
I rewrote an earlier answer to be cleaner and to handle string escaping properly:
from django import template
from django.template.defaultfilters import stringfilter
from django.utils.html import conditional_escape
from django.utils.safestring import mark_safe

register = template.Library()


@register.filter(needs_autoescape=True)
@stringfilter
def read_more(s, show_words, autoescape=True):
    """Split text after so many words, inserting a "more" link at the end.

    Relies on JavaScript to react to the link being clicked and on classes
    found in Bootstrap to hide elements.
    """
    show_words = int(show_words)
    if autoescape:
        esc = conditional_escape
    else:
        esc = lambda x: x

    words = esc(s).split()
    if len(words) <= show_words:
        return s

    insertion = (
        # The see more link...
        '<span class="read-more">…'
        ' <a href="#">'
        ' <i class="fa fa-plus-square gray" title="Show All"></i>'
        ' </a>'
        '</span>'
        # The call to hide the rest...
        '<span class="more hidden">'
    )

    # wrap the more part
    words.insert(show_words, insertion)
    words.append('</span>')

    return mark_safe(' '.join(words))
The HTML in there assumes you're using Bootstrap and Fontawesome, but if that's not your flavor, it's easy to adapt.
For the JavaScript, assuming you're using jQuery (if you're using Bootstrap you probably are), you'll just need to add something like this:
$(".read-more").click(function(e) {
e.preventDefault();
var t = $(this);
t.parent().find('.more').removeClass('hidden');
t.addClass('hidden');
});
