I'm writing code that is attempting to extract the text from the Library of Babel.
They use a system of Hexes, Walls, Shelves, Volumes and Pages to split up their library of randomly generated text files. Here is an example (https://libraryofbabel.info/book.cgi?2-w1-s2-v22:1).
Here we have Hex: 2, Wall: 1, Shelf: 2, Volume: 22, Page: 1.
I would ideally like to randomly generate a page across all these variables and extract its text, but I am not getting the output I would expect.
Here is my code:
import requests
from bs4 import BeautifulSoup
from urlparse import urlparse
import random
hex = str(random.randint(0, 6))
wall = str(random.randint(1, 4))
shelf = str(random.randint(1, 5))
vol = str(random.randint(1, 32))
page = str(random.randint(1, 410))
print("Fetching: " + " Hex: " + hex + ", Wall: " + wall + ", Shelf: " + shelf + ", Vol: " + vol + ", Page: " + page)
babel_url = str("https://libraryofbabel.info/browse.cgi?" + hex + "-w" + wall + "-s" + shelf + "-v" + vol + ":" + page)
r = requests.get(babel_url)
soup = BeautifulSoup(r.text)
print(soup.get_text())
My output is identical to what I get if I change the URL to just https://libraryofbabel.info/browse.cgi. print(babel_url) shows me that the URL is built the way I intended, but something isn't interpreting it the way I want.
I've found that just pasting https://libraryofbabel.info/book.cgi?2-w1-s2-v22:1 into Chrome drops me at https://libraryofbabel.info/book.cgi, but if I navigate to https://libraryofbabel.info/book.cgi?2-w1-s2-v22:1 (or any other page) I can move between pages at will.
The only thing I get in the output worth mentioning is:
It appears your browser has javascript disabled. Follow this link to browse without javascript.
Put on your glasses: you are requesting browse.cgi instead of book.cgi. You have
https://libraryofbabel.info/browse.cgi?2-w2-s1-v10:72
instead of
https://libraryofbabel.info/book.cgi?2-w2-s1-v10:72
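For completeness, here is a minimal sketch of the corrected request, keeping the random ranges from the question (whether those ranges actually cover every hex is an assumption carried over from the original code) and simply swapping browse.cgi for book.cgi:
import random
import requests
from bs4 import BeautifulSoup

hex_part = str(random.randint(0, 6))  # renamed from hex to avoid shadowing the built-in
wall = str(random.randint(1, 4))
shelf = str(random.randint(1, 5))
vol = str(random.randint(1, 32))
page = str(random.randint(1, 410))

# book.cgi, not browse.cgi, is the endpoint that serves an individual page of text
babel_url = ("https://libraryofbabel.info/book.cgi?"
             + hex_part + "-w" + wall + "-s" + shelf + "-v" + vol + ":" + page)

r = requests.get(babel_url)
soup = BeautifulSoup(r.text, "html.parser")
print(soup.get_text())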
I wrote a script that tests whether a link on a selected website is broken.
Here is the code I tried to port to Python 3:
"""
This script tests whether a link is broken.
It will test all the links, external or internal, on the website.
It will also give you the number of broken links.
"""
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
import requests
# from urllib.parse import urljoin
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import sys
# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()
# number of urls visited so far will be stored here
total_urls_visited = 0
total_broken_link = set()
output = 'output.txt'
def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

"""
Almost any value evaluates to True if it has some sort of content.
Every URL should follow a specific format: <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
Example: http://www.example.com/index?search=src
Here, www.example.com is the netloc, index is the path,
search is the query parameter, and src is the value being passed along the parameter search.
This makes sure that a proper scheme (protocol, e.g. http or https) and a domain name exist in the URL.
"""
def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`; we use a Python set() because we don't want redundant links
    urls = set()
    # domain name of the URL without the protocol, to check whether a link is internal or external
    domain_name = urlparse(url).netloc
    # BeautifulSoup: Python library for pulling data out of HTML or XML files
    soup = BeautifulSoup(requests.get(url).content, "html.parser", from_encoding="iso-8859-1")
    # print(soup.prettify())  # check that the HTML of the page is displayed correctly
    # print(soup.find_all('a'))  # collect all the anchor tags
    for a_tag in soup.findAll("a"):
        href = a_tag.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue
        href = urljoin(url, href)  # resolve relative links against the page URL
        # print(internal_urls)
        # print('href:' + href)
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                # print("External link:" + href)
                # print((requests.get(href)).status_code)
                is_broken_link(href, url)
                external_urls.add(href)
            continue
        # print("Internal link:" + href)
        # print((requests.get(href)).status_code)
        is_broken_link(href, url)
        urls.add(href)           # not an external link, keep it for crawling
        internal_urls.add(href)  # not an external link, remember it globally
    return urls
def is_broken_link(url, origin_url):
    if ((requests.get(url)).status_code) != 200:
        # print("This link is broken")
        print(('|' + url.encode('utf-8').center(60) + '|' + origin_url.encode('utf-8').center(60) + '|' + '\n'))
        total_broken_link.add(url)
        return True
    else:
        # print("This link works well")
        return False
def crawl(url, max_urls=80):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
if __name__ == "__main__":
    print(('\n' + '|' + color.BOLD + color.RED + "Broken links".center(60) + color.END + '|' + color.BOLD + color.YELLOW + "Origin of the link".center(60) + color.END + '|'))
    print(('+' + '-'.center(60,'-') + '+' + '-'.center(60,'-') + '+'))
    if len(sys.argv) <= 1:
        crawl('http://localhost:1313/')
    else:
        crawl(sys.argv[1])
    print(('Total External links:' + str(len(external_urls))))
    print(('Total Internal links:' + str(len(internal_urls))))
    print(('Total:' + str(len(external_urls) + len(internal_urls))))
    print(('Be careful: ' + color.BOLD + color.PURPLE + str(len(total_broken_link)) + color.END + ' broken links found !'))
Also, I am running this inside Docker, and the Dockerfile installs the dependencies with the following command:
RUN python -m pip install requests beautifulsoup4
When I run my code I get this traceback:
Traceback (most recent call last):
  File "/home/camille/workspace/test-link/test-link.py", line 124, in <module>
    crawl(sys.argv[1])
  File "/home/camille/workspace/test-link/test-link.py", line 115, in crawl
    crawl(link, max_urls=max_urls)
  File "/home/camille/workspace/test-link/test-link.py", line 111, in crawl
    links = get_all_website_links(url)
  File "/home/camille/workspace/test-link/test-link.py", line 86, in get_all_website_links
    is_broken_link(href, url)
  File "/home/camille/workspace/test-link/test-link.py", line 94, in is_broken_link
    print(('|' + url.encode('utf-8').center(60) + '|' + origin_url.encode('utf-8').center(60) + '|' + '\n'))
TypeError: can only concatenate str (not "bytes") to str
I think I changed everything that needed to be changed to move to Python 3, and I am very confused about this "bytes" concatenation. In fact, if I remember correctly I had the same issue with Python 2 and solved it. I don't know why the issue is back in Python 3.
Thank you for helping.
url.encode('utf-8') returns bytes, not a string. In Python 2, ordinary strings like 'hello' or '|' were bytes and could therefore be concatenated with other bytes. Now you are trying to concatenate bytes (url.encode('utf-8')) with strings ('|'). To fix the problem, all you need to do is remove the encode call:
print('|' + url.center(60) + '|' + origin_url.center(60) + '|' + '\n')
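Applied to the function from the question, the fix looks like this (a sketch; everything else is left unchanged):
def is_broken_link(url, origin_url):
    if requests.get(url).status_code != 200:
        # url and origin_url are already str objects in Python 3, so .center() can be used directly
        print('|' + url.center(60) + '|' + origin_url.center(60) + '|' + '\n')
        total_broken_link.add(url)
        return True
    return False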
I'm trying to write a script that scrapes the text of multiple webpages with slightly differing URLs. I want to step through the pages with an np.arange loop that inserts a number into the URL as a string. But there must be something wrong with the URL the script composes: the document that stores the scraped text contains only messages like "this site does not exist anymore". The steps I have taken to get closer to a solution are detailed below. Here is my code.
from bs4 import BeautifulSoup
import requests
import numpy as np
import datetime
from time import sleep
from random import randint

datum = datetime.datetime.now()
pages = np.arange(1, 20, 1)
datum_jetzt = datum.strftime("%Y") + "-" + datum.strftime("%m") + "-" + datum.strftime("%d")
url = "https://www.shabex.ch/pub/" + datum_jetzt + "/index-"
results = requests.get(url)
file_name = "" + datum.strftime("%Y") + "-" + datum.strftime("%m") + "-" + datum.strftime("%d") + "-index.htm"

for page in pages:
    page = requests.get("https://www.shabex.ch/pub/" + datum_jetzt + "/index-" + str(page) + ".htm")
    soup = BeautifulSoup(results.text, "html.parser")
    texte = soup.get_text()
    sleep(randint(2,5))
    f = open(file_name, "a")
    f.write(texte)
    f.close
I found that if I enter print("https://www.shabex.ch/pub/" + datum_jetzt + "/index-" + str(page) + ".htm") in the console, I get https://www.shabex.ch/pub/2020-05-18/index-<Response [200]>.htm. So it looks as if the loop gives me the response of the webserver instead of the value I seek.
Where have I gone wrong?
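For what it's worth, the printout suggests the culprit is not np.arange itself: inside the loop, page is reassigned to the requests Response object, and soup is built from results (the single response fetched before the loop) rather than from the page just requested. A sketch of the loop with the two kept apart, leaving the rest of the script as in the question:
for page in pages:
    # fetch this particular page; keep the loop counter and the HTTP response in separate names
    response = requests.get("https://www.shabex.ch/pub/" + datum_jetzt + "/index-" + str(page) + ".htm")
    soup = BeautifulSoup(response.text, "html.parser")
    texte = soup.get_text()
    sleep(randint(2, 5))
    with open(file_name, "a") as f:
        f.write(texte)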
I want to scrape 70 characters from this HTML code:
<p>2) Proof of payment emailed to satrader03<strong>#gmail.com</strong> direct from online banking 3) Selfie of you holding your ID 4) Selfie of you holding your bank card from which payment will be made OR 5) Skype or what's app Video call while logged onto online banking displaying account name which should match personal verified name Strictly no 3rd party payments</p>
I want to know how to scrape a specific number of characters with Selenium, for example 30 characters or some other count.
Here is my code:
description = driver.find_elements_by_css_selector("p")
items = len(title)
with open('btc_gmail.csv', 'a', encoding="utf-8") as s:
    for i in range(items):
        s.write(str(title[i].text) + ',' + link[i].text + ',' + description[i].text + '\n')
How can I scrape 30 characters, or 70, or some other number?
Edit (full code):
driver = webdriver.Firefox()
r = randrange(3, 7)
for url_p in url_pattren:
    time.sleep(3)
    url1 = 'https://www.bing.com/search?q=site%3alocalbitcoins.com+%27%40gmail.com%27&qs=n&sp=-1&pq=site%3alocalbitcoins+%27%40gmail.com%27&sc=1-31&sk=&cvid=9547A785CF084BAE94D3F00168283D1D&first=' + str(url_p) + '&FORM=PERE3'
    driver.get(url1)
    time.sleep(r)
    title = driver.find_elements_by_tag_name('h2')
    link = driver.find_elements_by_css_selector("cite")
    description = driver.find_elements_by_css_selector("p")
    items = len(title)
    with open('btc_gmail.csv', 'a', encoding="utf-8") as s:
        for i in range(items):
            s.write(str(title[i].text) + ',' + link[i].text + ',' + description[i].text[30:70] + '\n')
Any solution?
You can get the text of the tag and then slice the string:
>>> description = driver.find_elements_by_css_selector("p")[0].text
>>> print(description[30:70])  # printed from the 30th to the 70th character
'satrader03<strong>#gmail.com</strong>'
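Applied to the loop from the question, that would look something like this (a sketch; it assumes title, link and description are the element lists collected above and takes the first 70 characters of each description):
with open('btc_gmail.csv', 'a', encoding="utf-8") as s:
    for i in range(len(title)):
        # slice the description text to at most 70 characters
        s.write(title[i].text + ',' + link[i].text + ',' + description[i].text[:70] + '\n')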
I've never used Python before, so excuse my lack of knowledge, but I'm trying to scrape a XenForo forum for all of its threads. So far so good, except that it's picking up multiple URLs for each page of the same thread. I've posted some data below to explain what I mean.
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/page-9
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/page-10
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/page-11
Ideally, what I would want to scrape is just one of these:
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/
Here is my script:
from bs4 import BeautifulSoup
import requests
def get_source(url):
    return requests.get(url).content

def is_forum_link(self):
    return self.find('special string') != -1

def fetch_all_links_with_word(url, word):
    source = get_source(url)
    soup = BeautifulSoup(source, 'lxml')
    return soup.select("a[href*=" + word + "]")

main_url = "http://example.com/forum/"
forumLinks = fetch_all_links_with_word(main_url, "forums")
forums = []

for link in forumLinks:
    if link.has_attr('href') and link.attrs['href'].find('.rss') == -1:
        forums.append(link.attrs['href'])

print('Fetched ' + str(len(forums)) + ' forums')

threads = {}
for link in forums:
    threadLinks = fetch_all_links_with_word(main_url + link, "threads")
    for threadLink in threadLinks:
        print(link + ': ' + threadLink.attrs['href'])
        threads[link] = threadLink

print('Fetched ' + str(len(threads)) + ' threads')
This solution assumes that what should be removed from the URL to check for uniqueness is always going to be "/page-#...". If that is not the case, this solution will not work.
Instead of using a list to store your URLs, you can use a set, which will only keep unique values. Then, before adding a URL to the set, remove the last instance of "page" and anything that comes after it, provided it is in the format "/page-#", where # is any number.
forums = set()
for link in forumLinks:
    if link.has_attr('href') and link.attrs['href'].find('.rss') == -1:
        url = link.attrs['href']
        position = url.rfind('/page-')
        if position > 0 and url[position + 6:position + 7].isdigit():
            url = url[:position + 1]
        forums.add(url)
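Run against the sample paths from the question, the set ends up with a single entry (a quick check using plain strings instead of BeautifulSoup tags):
sample = [
    "threads/my-gap-year-uni-story.13846/",
    "threads/my-gap-year-uni-story.13846/page-9",
    "threads/my-gap-year-uni-story.13846/page-10",
    "threads/my-gap-year-uni-story.13846/page-11",
]
unique = set()
for url in sample:
    position = url.rfind('/page-')
    if position > 0 and url[position + 6:position + 7].isdigit():
        url = url[:position + 1]
    unique.add(url)
print(unique)  # {'threads/my-gap-year-uni-story.13846/'}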
I'm using BeautifulSoup to scrape some web content.
I'm learning with this example code, but I always get a None response.
Code:
import urllib2
from BeautifulSoup import BeautifulSoup
soup = BeautifulSoup(urllib2.urlopen('http://www.velocidadcuchara.com/2011/08/helado-platano-light.html').read())
post = soup.find('div', attrs={'id': 'topmenucontainer'})
print post
Any idea what I'm doing wrong?
Thanks!
I don't think you are doing anything wrong.
It is the second script tag that is confusing BeautifulSoup. The tag looks like this:
<script type='text/javascript'>
<!--//--><![CDATA[//><!--
var arVersion = navigator.appVersion.split("MSIE")
var version = parseFloat(arVersion[1])
function fixPNG(myImage)
{
    if ((version >= 5.5) && (version < 7) && (document.body.filters))
    {
        var imgID = (myImage.id) ? "id='" + myImage.id + "' " : ""
        var imgClass = (myImage.className) ? "class='" + myImage.className + "' " : ""
        var imgTitle = (myImage.title) ?
            "title='" + myImage.title + "' " : "title='" + myImage.alt + "' "
        var imgStyle = "display:inline-block;" + myImage.style.cssText
        var strNewHTML = "<span " + imgID + imgClass + imgTitle
            + " style=\"" + "width:" + myImage.width
            + "px; height:" + myImage.height
            + "px;" + imgStyle + ";"
            + "filter:progid:DXImageTransform.Microsoft.AlphaImageLoader"
            + "(src=\'" + myImage.src + "\', sizingMethod='scale');\"></span>"
        myImage.outerHTML = strNewHTML
    }
}
//--><!]]>
</script>
but BeautifulSoup seems to think it is still in a comment or something and includes the rest of the file as the content of the script tag.
Try:
print str(soup.findAll('script')[1])[:2000]
and you'll see what I mean.
If you remove the CDATA then you should find the page parses correctly:
soup = BeautifulSoup(
    urllib2.urlopen('http://www.velocidadcuchara.com/2011/08/helado-platano-light.html')
    .read()
    .replace('<![CDATA[', '').replace('<!]]>', ''))
There's something weird with your HTML. BeautifulSoup tries its best, but sometimes it just can't parse it.
Try moving the first <link> element inside the <head>; that might help.
You could try using the lxml library.
lxml article
from lxml.html import parse
doc = parse('http://java.sun.com').getroot()
post = doc.cssselect('div#topmenucontainer')
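Note that cssselect returns a list of matching elements, so you would read the first match something like this (a sketch, assuming the div exists on the page):
if post:
    print post[0].text_content()  # text of the first matching div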