One quick question. I'm building a scraper that outputs emails. At the moment the emails are written to a CSV file, but I want them to be output as paragraphs on another URL. I've tried a few things but nothing has worked. Here is the code:
views.py
from django.shortcuts import render
from .scraper import EmailCrawler


def index(request):
    return render(request, 'leadfinderapp/scrape.html')


def scrape(request):
    url = request.GET.get('Email')
    crawl = EmailCrawler(url)
    crawl.crawl()
    return render(request, 'leadfinderapp/results.html')
Here is the HTML file where I'm trying to output the emails (ignore the code) (results.html):
{% load static %}
<html>
{% for email in scrape %}
<p>{{ result }}</p>
{% endfor %}
</html>
email crawler (scraper.py):
import re
import requests
import requests.exceptions
from urllib.parse import urlsplit, urljoin
from lxml import html
import sys
import csv
class EmailCrawler:

    processed_urls = set()
    unprocessed_urls = set()
    emails = set()

    def __init__(self, website: str):
        # processed_urls = set()
        # unprocessed_urls = set()
        # emails = set()
        emails = set()
        self.website = website
        self.email = emails
        self.unprocessed_urls.add(website)
        self.headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/78.0.3904.70 Chrome/78.0.3904.70 Safari/537.36',
        }
        self.base_url = urlsplit(self.website).netloc
        self.outputfile = self.base_url.replace('.', '_') + '.csv'
        # We will use this list to skip urls that contain one of these extensions.
        # This saves a lot of bandwidth and speeds up the crawling process.
        # For example: www.example.com/image.png is useless to us; we cannot parse emails from images and other such files.
        self.garbage_extensions = ['.aif','.cda','.mid','.midi','.mp3','.mpa','.ogg','.wav','.wma','.wpl','.7z','.arj','.deb','.pkg','.rar','.rpm','.tar.gz','.z','.zip','.bin','.dmg','.iso','.toast','.vcd','.csv','.dat','.db','.dbf','.log','.mdb','.sav','.sql','.tar','.apk','.bat','.bin','.cgi','.pl','.exe','.gadget','.jar','.py','.wsf','.fnt','.fon','.otf','.ttf','.ai','.bmp','.gif','.ico','.jpeg','.jpg','.png','.ps','.psd','.svg','.tif','.tiff','.asp','.cer','.cfm','.cgi','.pl','.part','.py','.rss','.key','.odp','.pps','.ppt','.pptx','.c','.class','.cpp','.cs','.h','.java','.sh','.swift','.vb','.ods','.xlr','.xls','.xlsx','.bak','.cab','.cfg','.cpl','.cur','.dll','.dmp','.drv','.icns','.ico','.ini','.lnk','.msi','.sys','.tmp','.3g2','.3gp','.avi','.flv','.h264','.m4v','.mkv','.mov','.mp4','.mpg','.mpeg','.rm','.swf','.vob','.wmv','.doc','.docx','.odt','.pdf','.rtf','.tex','.txt','.wks','.wps','.wpd']
        self.email_count = 0
    def crawl(self):
        """
        Continue crawling until the unprocessed urls list is empty.
        """
        url = self.unprocessed_urls.pop()
        print("CRAWL : {}".format(url))
        self.parse_url(url)

        if len(self.unprocessed_urls) != 0:
            self.crawl()
        else:
            print('End of crawling for {} '.format(self.website))
            print('Total urls visited {}'.format(len(self.processed_urls)))
            print('Total Emails found {}'.format(self.emails))
    def parse_url(self, current_url: str):
        """
        Load and parse a given url, find all the urls in the page,
        filter them and add them to the unprocessed url list.
        Finally, scrape any emails found on the page and update the email set.
        INPUT:
            current_url: URL to parse
        RETURN:
            None
        """
        response = requests.get(current_url, headers=self.headers)
        tree = html.fromstring(response.content)
        urls = tree.xpath('//a/@href')  # getting all urls in the page

        # Make sure relative paths are converted to full urls,
        # for example: /about.html --> https://www.website.com/about.html
        urls = [urljoin(self.website, url) for url in urls]

        # Only include urls that fall under our domain, i.e. filter out urls that point outside the main website.
        urls = [url for url in urls if self.base_url == urlsplit(url).netloc]

        # Removing duplicates
        urls = list(set(urls))

        # Filtering urls that point to files such as images, videos and others listed in garbage_extensions.
        # Loop through all the urls and skip them if they end with one of those extensions.
        parsed_url = []
        for url in urls:
            skip = False
            for extension in self.garbage_extensions:
                if url.endswith(extension) or url.endswith(extension + '/'):
                    skip = True
                    break
            if not skip:
                parsed_url.append(url)

        # Finally, filter out urls that are already in the queue or already visited.
        for url in parsed_url:
            if url not in self.processed_urls and url not in self.unprocessed_urls:
                self.unprocessed_urls.add(url)

        # Parsing emails
        self.parse_emails(response.text)
        # Adding the current url to the processed list
        self.processed_urls.add(current_url)
    def parse_emails(self, text: str):
        """
        Scan the given text for email addresses and add any new ones to the email set.
        Input:
            text: text to parse emails from
        Returns:
            bool: True if at least one email was found on the page
        """
        emails = set(re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text, re.I))
        # TODO: sometimes "gFJS3amhZEg_z39D5EErVg@2x.png" gets accepted as an email by the above regex,
        # so for now check whether the email ends with jpeg, png or jpg.
        for email in emails:
            skip_email = False
            for checker in ['jpg', 'jpeg', 'png']:
                if email.endswith(checker):
                    skip_email = True
                    break

            if not skip_email:
                if email not in self.emails:
                    print(email)
                    self.email_count += 1
                    self.emails.add(email)

        if len(emails) != 0:
            return True
        else:
            return False
# try:
#     website = sys.argv[1]
# except:
#     website = input("Please enter a website to crawl for emails:")

# crawl = EmailCrawler(website)
# crawl.crawl()
You should pass the results as context to the render call!
What does your crawl() method return? If it returns a list of emails, you should do something like:
# ...
email_list = crawl.crawl()
return render(request, 'leadfinderapp/results.html', {"email_list": email_list})
Then in the template you can loop over them:
{% for email in email_list %}
<p> {{ email }} </p>
{% endfor %}
EDIT
The crawl method does not return anything. Therefore my suggestion above (which assumes a list is returned) will not work...
I see that the method is called recursively, so what I suggest is that you update a class-level variable at each iteration (a sketch of this crawler-side change follows the view code below):
- define email_list in your init (or you can use your current email set, perhaps?)
- append the scraped emails to that variable from within the crawl method at each iteration
then in your view:
def scrape(request):
    url = request.GET.get('Email')
    crawl = EmailCrawler(url)
    crawl.crawl()
    email_list = crawl.email_list  # or whatever class-level variable you use
    return render(request, 'leadfinderapp/results.html', {"email_list": email_list})
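For completeness, a minimal sketch of the crawler side, assuming you keep the class-level emails set from your scraper.py; email_list here is just a convenience name chosen to match the view above, and this version also turns the recursion into a loop so large sites don't hit the recursion limit:
class EmailCrawler:
    # ... existing __init__, parse_url and parse_emails unchanged ...

    def crawl(self):
        while self.unprocessed_urls:
            url = self.unprocessed_urls.pop()
            print("CRAWL : {}".format(url))
            self.parse_url(url)
        # expose the collected emails so the view can hand them to the template
        self.email_list = sorted(self.emails)
        return self.email_list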
I'm creating an addon and I'm modifying some functions that come within a py file.
What I intend to do is the following, I have this code:
def channellist():
return json.loads(openfile('lib.json',pastafinal=os.path.join(tugapath,'resources')))
This code accesses a lib.json file that is inside the resources subfolder of the tugapath folder. What I did was put the lib.json file on Dropbox, and I want to use the Dropbox link to lib.json instead of pointing at the local folders.
I tried to change the code, but without success.
def channellist():
return json.loads(openfile('lib.json',pastafinal=os.path.join("https://www.dropbox.com/s/sj1246qtiodm6qd/lib.json?dl=1')))
If someone can help me, I'm grateful!
Thank you first.
Given that your link holds valid JSON - which is not the case with the content you posted - you could use requests.
If the content at Dropbox looked like this:
{"tv":
{"epg": "tv",
"streams":
[{"url": "http://topchantv.net:3456/live/Stalker/Stalker/838.m3u8",
"name": "IPTV",
"resolve": False,
"visible": True}],
"name": "tv",
"thumb": "thumb_tv.png"
}
}
Then fetching the content would look like this:
import requests
url = 'https://www.dropbox.com/s/sj1246qtiodm6qd/lib.json?dl=1'
r = requests.get(url)
json_object = r.json()
So if you needed it inside a function, I guess you'd input the url and return the json like so:
def channellist(url):
    r = requests.get(url)
    json_object = r.json()
    return json_object
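If the rest of the addon calls channellist() with no arguments, here is a minimal sketch that keeps the original signature, assuming the Dropbox URL from your attempt and that requests is available in the addon environment:
import requests

LIB_URL = 'https://www.dropbox.com/s/sj1246qtiodm6qd/lib.json?dl=1'

def channellist():
    # fetch lib.json from Dropbox instead of the local resources folder
    r = requests.get(LIB_URL)
    r.raise_for_status()  # fail loudly if the file could not be downloaded
    return r.json()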
1) I parse some pages to get information.
2) Since the information is hard to extract on its own, I insert it into an HTML page and make it look nice with custom CSS.
3) Then I try to convert it to PDF to provide it to customers.
But all the PDF converters ask for a specific URL, or a file, and so on. For example:
def parse(request):
    done = csrf(request)
    if request.POST:
        USERNAME = request.POST.get('logins', '')
        PASSWORD = request.POST.get('password', '')
        dialogue_url = request.POST.get('links', '')
        total_pages = int(request.POST.get('numbers', ''))
        news = []
        news.extend(parse_one(USERNAME, PASSWORD, dialogue_url, total_pages))
        contex = {
            "news": news,
        }
        done.update(contex)
        pageclan = render(request, 'marketing/parser.html', done)
        # create an API client instance
        client = pdfcrowd.Client(*** ***)
        # convert a web page and store the generated PDF in a variable.
        # This doesn't work: the converter doesn't support such a url.
        pdf = client.convertURI('pageclan')
        # set HTTP response headers
        response = HttpResponse(content_type="application/pdf")
        response["Cache-Control"] = "max-age=0"
        response["Accept-Ranges"] = "none"
        response["Content-Disposition"] = "attachment; filename=jivo_log.pdf"
        # send the generated PDF
        response.write(pdf)
        return response
Are there any tools that can handle this?
From PDFCrowd Python API documentation:
You can also convert raw HTML code, just use the convertHtml() method instead of convertURI():
pdf = client.convertHtml("<head></head><body>My HTML Layout</body>")
which means that you can modify your code to call convertHtml with your rendered page (pageclan.content is the rendered HTML):
pdf = client.convertHtml(pageclan.content)
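One possible caveat: on Python 3, pageclan.content is a bytes object, so depending on the pdfcrowd client version you may need to decode it first. A minimal sketch of just the relevant lines:
pageclan = render(request, 'marketing/parser.html', done)
html_string = pageclan.content.decode('utf-8')  # rendered HTML as a text string
pdf = client.convertHtml(html_string)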
I'm trying to modify some existing code to return the value of 'title' from an API call. I was wondering if it's possible?
Example API Url: http://domain.com/rest/getSong.view?u=username&p=password&v=1.8.0&id=11452
The above URL returns:
<domain-response xmlns="http://domain.org/restapi" status="ok" version="1.8.0">
<song id="11452" parent="11044" title="The Title" album="The Album"/>
</domain-response>
Now is there a way to use python to get the 'title' value if I know the id?
Example of current code using the REST API in a file called domain.py
def get_playlist(self, playlist_id):
    Addon.log('get_playlist: ' + playlist_id)
    payload = self.__get_json('getPlaylist.view', {'id': playlist_id})
    if payload:
        songs = self.listify(payload['playlist']['entry'])
        self.display_music_directory(songs)
Rest of referenced code from another file called default.py
elif Addon.plugin_queries['mode'] == 'playlist':
    subsonic.get_playlist(Addon.plugin_queries['playlist_id'])
As your response is in XML format, the intuitive way is to use an XML parser. Here's how to use lxml to parse your response and get the title of the song with ID 11452:
from lxml import etree
s = """<domain-response xmlns="http://domain.org/restapi" status="ok" version="1.8.0">
<song id="11452" parent="11044" title="The Title" album="The Album"/>
</domain-response>"""
tree = etree.fromstring(s)
song = tree.xpath("//ns:song[@id='11452']", namespaces={'ns': 'http://domain.org/restapi'})
print song[0].get('title')
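If you want to fetch the XML straight from the REST call rather than from a hard-coded string, here is a rough sketch using requests (the URL and the username/password/id parameters are the placeholders from your example, not real values):
import requests
from lxml import etree

url = 'http://domain.com/rest/getSong.view'
params = {'u': 'username', 'p': 'password', 'v': '1.8.0', 'id': '11452'}
r = requests.get(url, params=params)

tree = etree.fromstring(r.content)
song = tree.xpath("//ns:song[@id='11452']", namespaces={'ns': 'http://domain.org/restapi'})
if song:
    title = song[0].get('title')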
It's worth mentioning that there's also a dirty way to get the title, if you don't care about the rest of the content, by using a regular expression:
import re
print re.compile("song id=\"11452\".*?title=\"(.*?)\"").search(s).group(1)
My Python level is novice. I have never written a web scraper or crawler. I have written Python code to connect to an API and extract the data that I want, but for some of the extracted data I want to get the gender of the author. I found this web site http://bookblog.net/gender/genie.php, but the downside is that there isn't an API available. I was wondering how to write Python code to submit data to the form on that page and extract the returned data. It would be a great help if I could get some guidance on this.
This is the form dom:
<form action="analysis.php" method="POST">
<textarea cols="75" rows="13" name="text"></textarea>
<div class="copyright">(NOTE: The genie works best on texts of more than 500 words.)</div>
<p>
<b>Genre:</b>
<input type="radio" value="fiction" name="genre">
fiction
<input type="radio" value="nonfiction" name="genre">
nonfiction
<input type="radio" value="blog" name="genre">
blog entry
</p>
<p>
</form>
results page dom:
<p>
<b>The Gender Genie thinks the author of this passage is:</b>
male!
</p>
No need to use mechanize; just send the correct form data in a POST request.
Also, using regular expressions to parse HTML is a bad idea. You would be better off using an HTML parser like lxml.html.
import requests
import lxml.html as lh
def gender_genie(text, genre):
    url = 'http://bookblog.net/gender/analysis.php'
    caption = 'The Gender Genie thinks the author of this passage is:'

    form_data = {
        'text': text,
        'genre': genre,
        'submit': 'submit',
    }

    response = requests.post(url, data=form_data)
    tree = lh.document_fromstring(response.content)

    return tree.xpath("//b[text()=$caption]", caption=caption)[0].tail.strip()


if __name__ == '__main__':
    print gender_genie('I have a beard!', 'blog')
You can use mechanize to submit the form and retrieve the content, and the re module to extract what you want. For example, the script below does it for the text of your own question:
import re
from mechanize import Browser
text = """
My python level is Novice. I have never written a web scraper
or crawler. I have written a python code to connect to an api and
extract the data that I want. But for some the extracted data I want to
get the gender of the author. I found this web site
http://bookblog.net/gender/genie.php but downside is there isn't an api
available. I was wondering how to write a python to submit data to the
form in the page and extract the return data. It would be a great help
if I could get some guidance on this."""
browser = Browser()
browser.open("http://bookblog.net/gender/genie.php")
browser.select_form(nr=0)
browser['text'] = text
browser['genre'] = ['nonfiction']
response = browser.submit()
content = response.read()
result = re.findall(
r'<b>The Gender Genie thinks the author of this passage is:</b> (\w*)!', content)
print result[0]
What does it do? It creates a mechanize.Browser and goes to the given URL:
browser = Browser()
browser.open("http://bookblog.net/gender/genie.php")
Then it selects the form (since there is only one form to be filled, it will be the first):
browser.select_form(nr=0)
Also, it sets the entries of the form...
browser['text'] = text
browser['genre'] = ['nonfiction']
... and submits it:
response = browser.submit()
Now, we get the result:
content = response.read()
We know that the result is in the form:
<b>The Gender Genie thinks the author of this passage is:</b> male!
So we create a regex for matching and use re.findall():
result = re.findall(
r'<b>The Gender Genie thinks the author of this passage is:</b> (\w*)!',
content)
Now the result is available for your use:
print result[0]
You can use mechanize; see its examples for details.
from mechanize import ParseResponse, urlopen, urljoin
uri = "http://bookblog.net"
response = urlopen(urljoin(uri, "/gender/genie.php"))
forms = ParseResponse(response, backwards_compat=False)
form = forms[0]
#print form
form['text'] = 'cheese'
form['genre'] = ['fiction']
print urlopen(form.click()).read()