PyCharm Python: AttributeError: module 'urllib3' has no attribute 'Request'

When I run the script I get this error:
AttributeError: module 'urllib3' has no attribute 'Request'
How do I install/import urllib properly to run this script?
Here's some code:
import urllib3
import csv
import re
from bs4 import BeautifulSoup

rank_page = 'https://socialblade.com/youtube/top/50/mostviewed'
request = urllib3.Request(rank_page, headers={'User-Agent': 'your user-agent'})
page = urllib3.urlopen(request)
soup = BeautifulSoup(page, 'html.parser')
channels = soup.find('div', attrs={'style': 'float: right; width: 900px;'}).find_all('div', recursive=False)[4:]

file = open('topyoutubers.csv', 'wb')
writer = csv.writer(file)

# write title row
writer.writerow(['Username', 'Uploads', 'Views'])

for channel in channels:
    username = channel.find('div', attrs={'style': 'float: left; width: 350px; line-height: 25px;'}).a.text.strip()
    uploads = channel.find('div', attrs={'style': 'float: left; width: 80px;'}).span.text.strip()
    views = channel.find_all('div', attrs={'style': 'float: left; width: 150px;'})[1].span.text.strip()
    print(username + ' ' + uploads + ' ' + views)
    writer.writerow([username.encode('utf-8'), uploads.encode('utf-8'), views.encode('utf-8')])

file.close()

The Request class is not part of urllib3 at all; in Python 2 it lives in urllib2 and in Python 3 it lives in urllib.request, so this error is expected. Use the standard library instead:
import urllib.request
request = urllib.request.Request(rank_page, headers={'User-Agent': 'your user-agent'})
page = urllib.request.urlopen(request)
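A fuller Python 3 sketch of the top of the script with that fix applied (the User-Agent string is a placeholder; note that headers go on the Request object, not on urlopen, and that in Python 3 the CSV file should be opened in text mode with newline=''):
import csv
import urllib.request
from bs4 import BeautifulSoup

rank_page = 'https://socialblade.com/youtube/top/50/mostviewed'
request = urllib.request.Request(rank_page, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(request) as page:
    soup = BeautifulSoup(page, 'html.parser')

with open('topyoutubers.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Username', 'Uploads', 'Views'])
    # ... same parsing loop as in the question, minus the .encode('utf-8') calls ...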

Related

Selecting all values within a tag from a webpage using a selector

I want to select all values that belong to a specific tag using the selector function, however it seems to return only the first value given that the tag is repeated more than once. Shouldn't it select all the values under all the tags when repeated?
For example:
import requests
from bs4 import BeautifulSoup

url = 'https://findthatlocation.com/film-title/a-knights-tale'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
street = list(soup.select_one("div[style='color: #999; font-size: 12px; margin-bottom: 5px;']").stripped_strings)
# or
st = soup.find('div', {'style': 'color: #999; font-size: 12px; margin-bottom: 5px;'})
Only returns:
#street.text.strip()
['Prague,']
st.text.strip()
Prague,
However, more than one of the tag appears in the webpage, so I was expecting something like this:
#when using street.text.strip()
['Prague,', 'Prague Castle, Prague']
Use .select, not .select_one:
import requests
from bs4 import BeautifulSoup
url = "https://findthatlocation.com/film-title/a-knights-tale"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
out = [d.get_text(strip=True) for d in soup.select("h3 + div")]
print(out)
Prints:
['Prague,', 'Prague Castle, Prague']
Code:
url = 'https://findthatlocation.com/film-title/a-knights-tale'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
# use select instead of select_one
st = soup.select("div[style='color: #999; font-size: 12px; margin-bottom: 5px;']")
box = []
for i in st:
    box.append(i.text.strip())
box
Returns:
['Prague,', 'Prague Castle, Prague']
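The same loop can also be written as a single list comprehension over the ResultSet:
box = [i.get_text(strip=True) for i in st]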

How to make a dataframe download through browser using python

I have a function that generates a dataframe, which I export as an Excel sheet at the end of the function.
df.to_excel('response.xlsx')
This excel file is being saved in my working directory.
Now I'm hosting this as a Streamlit web app on Heroku, but I want this Excel file to be downloaded to the user's local disk (a normal browser download) once this function is called. Is there a way to do it?
Snehan Kekre, from streamlit, wrote the following solution in this thread.
import streamlit as st
import pandas as pd
import io
import base64
import os
import json
import pickle
import uuid
import re
def download_button(object_to_download, download_filename, button_text, pickle_it=False):
    """
    Generates a link to download the given object_to_download.

    Params:
    ------
    object_to_download: The object to be downloaded.
    download_filename (str): filename and extension of file. e.g. mydata.csv, some_txt_output.txt
    download_link_text (str): Text to display for download link.
    button_text (str): Text to display on download button (e.g. 'click here to download file')
    pickle_it (bool): If True, pickle file.

    Returns:
    -------
    (str): the anchor tag to download object_to_download

    Examples:
    --------
    download_link(your_df, 'YOUR_DF.csv', 'Click to download data!')
    download_link(your_str, 'YOUR_STRING.txt', 'Click to download text!')
    """
    if pickle_it:
        try:
            object_to_download = pickle.dumps(object_to_download)
        except pickle.PicklingError as e:
            st.write(e)
            return None
    else:
        if isinstance(object_to_download, bytes):
            pass
        elif isinstance(object_to_download, pd.DataFrame):
            # object_to_download = object_to_download.to_csv(index=False)
            towrite = io.BytesIO()
            object_to_download = object_to_download.to_excel(towrite, encoding='utf-8', index=False, header=True)
            towrite.seek(0)
        # Try JSON encode for everything else
        else:
            object_to_download = json.dumps(object_to_download)

    try:
        # some strings <-> bytes conversions necessary here
        b64 = base64.b64encode(object_to_download.encode()).decode()
    except AttributeError as e:
        b64 = base64.b64encode(towrite.read()).decode()

    button_uuid = str(uuid.uuid4()).replace('-', '')
    button_id = re.sub(r'\d+', '', button_uuid)

    custom_css = f"""
        <style>
            #{button_id} {{
                display: inline-flex;
                align-items: center;
                justify-content: center;
                background-color: rgb(255, 255, 255);
                color: rgb(38, 39, 48);
                padding: .25rem .75rem;
                position: relative;
                text-decoration: none;
                border-radius: 4px;
                border-width: 1px;
                border-style: solid;
                border-color: rgb(230, 234, 241);
                border-image: initial;
            }}
            #{button_id}:hover {{
                border-color: rgb(246, 51, 102);
                color: rgb(246, 51, 102);
            }}
            #{button_id}:active {{
                box-shadow: none;
                background-color: rgb(246, 51, 102);
                color: white;
            }}
        </style> """

    dl_link = custom_css + f'<a download="{download_filename}" id="{button_id}" href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}">{button_text}</a><br></br>'

    return dl_link


vals = ['A', 'B', 'C']
df = pd.DataFrame(vals, columns=["Title"])

filename = 'my-dataframe.xlsx'
download_button_str = download_button(df, filename, f'Click here to download {filename}', pickle_it=False)
st.markdown(download_button_str, unsafe_allow_html=True)
I'd recommend searching the thread on that discussion forum. There seem to be at least 3-4 alternatives to this code.
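One such alternative: newer Streamlit releases (0.88 and later) include a built-in st.download_button, which avoids the manual base64/HTML link entirely. A minimal sketch reusing the same example DataFrame (writing to Excel still requires openpyxl):
import io
import pandas as pd
import streamlit as st

df = pd.DataFrame(['A', 'B', 'C'], columns=['Title'])

# write the DataFrame to an in-memory Excel file
buffer = io.BytesIO()
df.to_excel(buffer, index=False)
buffer.seek(0)

st.download_button(
    label='Download my-dataframe.xlsx',
    data=buffer,
    file_name='my-dataframe.xlsx',
    mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
)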
Mark Madson has this workaround posted on github. I lifted it from the repo and am pasting here as an answer.
import base64
import pandas as pd
import streamlit as st
def st_csv_download_button(df):
    csv = df.to_csv(index=False)  # if no filename is given, a string is returned
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="data.csv">Download CSV File</a>'  # download filename is arbitrary
    st.markdown(href, unsafe_allow_html=True)
Usage:
st_csv_download_button(my_data_frame)
Then click the rendered link (or right-click + save-as) to download the file.
I think you can do the same by doing to_excel instead of to_csv.
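A sketch of that to_excel variant, following the same data-URI pattern as the answer above (openpyxl must be installed for to_excel; the download filename is arbitrary):
import base64
import io
import pandas as pd
import streamlit as st

def st_excel_download_button(df):
    towrite = io.BytesIO()
    df.to_excel(towrite, index=False)  # writes the xlsx bytes into the buffer
    towrite.seek(0)
    b64 = base64.b64encode(towrite.read()).decode()
    href = (f'<a href="data:application/vnd.openxmlformats-officedocument.'
            f'spreadsheetml.sheet;base64,{b64}" download="data.xlsx">Download Excel File</a>')
    st.markdown(href, unsafe_allow_html=True)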

how to bypass googletagmanager while scraping

Since the site added the Google Tag Manager script I can't get what I need. With this code I was scraping links from the pages, but now I'm getting "www.googletagmanager.com" in every row, so I don't know how to handle that. Thank you.
[HTML][1]
[HOW CSV FILE NOW LOOK][2]
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from csv import writer

data_list = ["LINKI", "GOWNO", "JAJCO"]
with open('innovators.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(data_list)
    for i in range(0, 50):
        #df = pd.read_csv("C:\\Users\\Lukasz\\Desktop\\PROJEKTY PYTHON\\W TRAKCIE\\bf3_strona2.csv")
        #url = "https://bf3.space/" + df['LINKS'][i]
        url = 'https://bf3.space/a-Byu6am3P'
        response = requests.get(url)
        data = response.text
        soup = BeautifulSoup(data, 'lxml')
        rows = soup.find('iframe')
        q = rows.get('src')
        writer.writerow([q])
[1]: https://i.stack.imgur.com/Ogq0N.png
[2]: https://i.stack.imgur.com/3JYqc.png
You can use soup.find() with lambda.
For example:
import requests
from bs4 import BeautifulSoup
url = 'https://bf3.space/a-Byu6am3P'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
print( soup.find('iframe', src=lambda s: 'googletagmanager.com' not in s) )
Prints first non-googletagmanager <iframe> tag:
<iframe align="center" frameborder="0" height="1500" src="https://ven-way.x.yupoo.com/albums/83591895?uid=1" style="margin: 10px 0;padding: 0px 0px; border:none" width="100%"></iframe>

Extract URL from webpage and save to disk

I am trying to write a script to automatically query sci-hub.io with an article's title and save a PDF copy of the article's full text to my computer with a specific file name.
To do this I have written the following code:
url = "http://sci-hub.io/"
data = read_csv("C:\\Users\\Sangeeta's\\Downloads\\distillersr_export (1).csv")
for index, row in data.iterrows():
try:
print('http://sci-hub.io/' + str(row['DOI']))
res = requests.get('http://sci-hub.io/' + str(row['DOI']))
print(res.content)
except:
print('NO DOI: ' + str(row['ref']))
This opens a CSV file with a list of DOIs and the names of the files to be saved. For each DOI, it then queries sci-hub.io for the full text. The presented page embeds the PDF in an iframe; however, I am now unsure how to extract the URL of the PDF and save it to disk.
An example of the page can be seen in the image below:
In this image, the desired URL is http://dacemirror.sci-hub.io/journal-article/3a257a9ec768d1c3d80c066186aba421/pajno2010.pdf.
How can I automatically extract this URL and then save the PDF file to disk?
When I print res.content, I get this:
b'<!DOCTYPE html>\n<html>\n <head>\n <title></title>\n <meta charset="UTF-8">\n <meta name="viewport" content="width=device-width">\n </head>\n <body>\n <style type = "text/css">\n body {background-color:#F0F0F0}\n div {overflow: hidden; position: absolute;}\n #top {top:0;left:0;width:100%;height:50px;font-size:14px} /* 40px */\n #content {top:50px;left:0;bottom:0;width:100%}\n p {margin:0;padding:10px}\n a {font-size:12px;font-family:sans-serif}\n a.target {font-weight:normal;color:green;margin-left:10px}\n a.reopen {font-weight:normal;color:blue;text-decoration:none;margin-left:10px}\n iframe {width:100%;height:100%}\n \n p.agitation {padding-top:5px;font-size:20px;text-align:center}\n p.agitation a {font-size:20px;text-decoration:none;color:green}\n\n .banner {position:absolute;z-index:9999;top:400px;left:0px;width:300px;height:225px;\n border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px}\n .banner img {border:0}\n \n p.donate {padding:0;margin:0;padding-top:5px;text-align:center;background:green;height:40px}\n p.donate a {color:white;font-weight:bold;text-decoration:none;font-size:20px}\n\n #save {position:absolute;z-index:9999;top:180px;left:8px;width:210px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:#F0F0F0;color:#333}\n\n #save a {text-decoration:none;color:white;font-size:inherit;color:#666}\n\n #save p { margin: 0; padding: 0; margin-top: 8px}\n\n #reload {position:absolute;z-index:9999;top:240px;left:8px;width:210px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:#F0F0F0;color:#333}\n\n #reload a {text-decoration:none;color:white;font-size:inherit;color:#666}\n\n #reload p { margin: 0; padding: 0; margin-top: 8px}\n\n\n #saveastro {position:absolute;z-index:9999;top:360px;left:8px;width:230px;height:70px;\n border-radius: 4px; border: solid 1px #ccc; background: white; text-align:center}\n #saveastro p { margin: 0; padding: 0; margin-top: 16px}\n \n \n #donate {position:absolute;z-index:9999;top:170px;right:16px;width:220px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:white;color:#333}\n \n #donate a {text-decoration:none;color:green;font-size:inherit}\n\n #donatein {position:absolute;z-index:9999;top:220px;right:16px;width:220px;height:36px;\n border-radius: 4px; border: solid 1px #ccc; padding: 5px;\n text-align:center;font-size:18px;background:green;color:#333}\n\n #donatein a {text-decoration:none;color:white;font-size:inherit}\n \n #banner {position:absolute;z-index:9999;top:50%;left:45px;width:250px;height:250px; padding: 0; border: solid 1px white; border-radius: 4px}\n \n </style>\n \n \n \n <script type = "text/javascript">\n window.onload = function() {\n var url = document.getElementById(\'url\');\n if (url.innerHTML.length > 77)\n url.innerHTML = url.innerHTML.substring(0,77) + \'...\';\n };\n </script>\n <div id = "top">\n \n <p class="agitation" style = "padding-top:12px">\n \xd0\xa1\xd1\x82\xd1\x80\xd0\xb0\xd0\xbd\xd0\xb8\xd1\x87\xd0\xba\xd0\xb0 \xd0\xbf\xd1\x80\xd0\xbe\xd0\xb5\xd0\xba\xd1\x82\xd0\xb0 Sci-Hub \xd0\xb2 \xd1\x81\xd0\xbe\xd1\x86\xd0\xb8\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xbd\xd1\x8b\xd1\x85 \xd1\x81\xd0\xb5\xd1\x82\xd1\x8f\xd1\x85 \xe2\x86\x92 <a target="_blank" href="https://vk.com/sci_hub">vk.com/sci_hub</a>\n </p>\n \n </div>\n \n <div id = "content">\n <iframe src = 
"http://moscow.sci-hub.io/202d9ebdfbb8c0c56964a31b2fdfe8e9/roerdink2016.pdf" id = "pdf"></iframe>\n </div>\n \n <div id = "donate">\n <p><a target = "_blank" href = "//sci-hub.io/donate">\xd0\xbf\xd0\xbe\xd0\xb4\xd0\xb4\xd0\xb5\xd1\x80\xd0\xb6\xd0\xb0\xd1\x82\xd1\x8c \xd0\xbf\xd1\x80\xd0\xbe\xd0\xb5\xd0\xba\xd1\x82 →</a></p>\n </div>\n <div id = "donatein">\n <p><a target = "_blank" href = "//sci-hub.io/donate">support the project →</a></p>\n </div>\n <div id = "save">\n <p>\xe2\x87\xa3 \xd1\x81\xd0\xbe\xd1\x85\xd1\x80\xd0\xb0\xd0\xbd\xd0\xb8\xd1\x82\xd1\x8c \xd1\x81\xd1\x82\xd0\xb0\xd1\x82\xd1\x8c\xd1\x8e</p>\n </div>\n <div id = "reload">\n <p>↻ \xd1\x81\xd0\xba\xd0\xb0\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c \xd0\xb7\xd0\xb0\xd0\xbd\xd0\xbe\xd0\xb2\xd0\xbe</p>\n </div>\n \n \n<!-- Yandex.Metrika counter --> <script type="text/javascript"> (function (d, w, c) { (w[c] = w[c] || []).push(function() { try { w.yaCounter10183018 = new Ya.Metrika({ id:10183018, clickmap:true, trackLinks:true, accurateTrackBounce:true, ut:"noindex" }); } catch(e) { } }); var n = d.getElementsByTagName("script")[0], s = d.createElement("script"), f = function () { n.parentNode.insertBefore(s, n); }; s.type = "text/javascript"; s.async = true; s.src = "https://mc.yandex.ru/metrika/watch.js"; if (w.opera == "[object Opera]") { d.addEventListener("DOMContentLoaded", f, false); } else { f(); } })(document, window, "yandex_metrika_callbacks"); </script> <noscript><div><img src="https://mc.yandex.ru/watch/10183018?ut=noindex" style="position:absolute; left:-9999px;" alt="" /></div></noscript> <!-- /Yandex.Metrika counter -->\n </body>\n</html>\n'
Which does include the URL, however I am unsure how to extract it.
Update:
I am now able to extract the URL but when I try to access the page with the PDF (through urllib.request) I get a 403 response even though the URL is valid. Any ideas on why and how to fix? (I am able to access through my browser so not IP blocked)
You can use the urllib library to access the HTML of the page and even download files, and a regex to find the URL of the file you want to download.
import urllib
import re

site = urllib.urlopen(".../index.html")
data = site.read()  # turns the contents of the site into a string
files = re.findall(r'(http|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,#?^=%&:/~+#-]*[\w#?^=%&/~+#-])?(.pdf)', data)  # finds the url
for file in files:
    urllib.urlretrieve(file, filepath)  # "filepath" is where you want to save it
Here is the solution:
url = re.search(r'<iframe src = "\s*([^"]+)"', res.content)
url.group(1)
urllib.urlretrieve(url.group(1), 'C:/.../Docs/test.pdf')
I ran it and it is working :)
For Python 3:
Change urllib.urlretrieve to urllib.request.urlretrieve.
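A rough Python 3 sketch of that same regex-plus-urlretrieve approach (the DOI is the one used later in this thread, and the pattern just grabs the iframe's src, so treat it as an assumption about the page layout):
import re
import urllib.request

res = urllib.request.urlopen('http://sci-hub.io/10.1016/j.anai.2016.01.022')
html = res.read().decode('utf-8', errors='replace')

match = re.search(r'<iframe src\s*=\s*"([^"]+\.pdf)"', html)
if match:
    urllib.request.urlretrieve(match.group(1), 'test.pdf')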
You can do it with clunky code requiring selenium, requests and scrapy.
Use selenium to request either an article title or DOI.
>>> from selenium import webdriver
>>> driver = webdriver.Firefox()   # create a browser session first (Chrome etc. also works)
>>> driver.get("http://sci-hub.io/")
>>> input_box = driver.find_element_by_name('request')
>>> input_box.send_keys('amazing scientific results\n')
An article by the title 'amazing scientific results' doesn't seem to exist. As a result, the site returns a diagnostic page in the browser window which we can ignore. It also puts 'http://sci-hub.io/' in webdriver's current_url property. This is helpful because it's an indication that the requested result isn't available.
>>> driver.current_url
'http://sci-hub.io/'
Let's try again, looking for the item that you know exists.
>>> driver.get("http://sci-hub.io/")
>>> input_box = driver.find_element_by_name('request')
>>> input_box.send_keys('DOI: 10.1016/j.anai.2016.01.022\n')
>>> driver.current_url
'http://sci-hub.io/10.1016/j.anai.2016.01.022'
This time the site returns a distinctive url. Unfortunately, if we load this using selenium we will get the pdf and, unless you're more able than I am, you will find it difficult to download this to a file on your machine.
Instead, I download it using the requests library. Loaded in this form you will find that the url of the pdf becomes visible in the HTML.
>>> import requests
>>> r = requests.get(driver.current_url)
To ferret out the url I use scrapy.
>>> from scrapy.selector import Selector
>>> selector = Selector(text=r.text)
>>> pdf_url = selector.xpath('.//iframe/@src')[0].extract()
Finally I use requests again to download the pdf so that I can save it to a conveniently named file on local storage.
>>> r = requests.get(pdf_url).content
>>> open('article_name', 'wb').write(r)
211853
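If you'd rather not pull in scrapy for this one step, BeautifulSoup (already used elsewhere in this thread) can extract the same attribute; a small sketch under the assumption that the page's first iframe is the PDF viewer:
>>> from bs4 import BeautifulSoup
>>> pdf_url = BeautifulSoup(r.text, 'html.parser').find('iframe')['src']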
I solved this using a combination of the answers above - namely SBO7 & Roxerg.
I use the following to extract the URL from the page and then download the PDF:
res = requests.get('http://sci-hub.io/' + str(row['DOI']))
useful = BeautifulSoup(res.content, "html5lib").find_all("iframe")
urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(useful[0]))
response = requests.get(urls[0])
with open("C:\\Users\\Sangeeta's\\Downloads\\ref\\" + str(row['ref']) + '.pdf', 'wb') as fw:
    fw.write(response.content)
Note: This will not work for all articles - some link to webpages (example) and this doesn't correctly work for those.
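As for the 403 mentioned in the question's update: the PDF host may be rejecting requests that don't look like a browser, so it can be worth sending a User-Agent header and reusing one requests session for both the article page and the PDF. A sketch under that assumption, using the same row['DOI'] / row['ref'] and urls[0] names as above:
import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})  # placeholder browser user agent

res = session.get('http://sci-hub.io/' + str(row['DOI']))
# ... extract urls[0] from the iframe exactly as above ...
pdf = session.get(urls[0])
with open(str(row['ref']) + '.pdf', 'wb') as fw:
    fw.write(pdf.content)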

Extracting multiple URLs with no 'a' or 'href' tags from web page with BS4

I'm making a simple program with Selenium that goes to Flickr.com, searches for the term the user inputs, and then prints out the URLs of all those images.
I'm struggling on the final part, getting just the URLs of the images. I've been using the class_= search to get the portion of the HTML where the URLs are. This returns the following multiple times when searching for 'apples':
<div class="view photo-list-photo-view requiredToShowOnServer awake"
data-view-signature="photo-list-photo-view__engagementModelName_photo-lite-
models__excludePeople_false__id_6246270647__interactionViewName_photo-list-
photo-interaction- view__isOwner_false__layoutItem_1__measureAFT_true__model_1__modelParams_1_ _parentContainer_1__parentSignature_photolist-
479__requiredToShowOnClient_true__requiredToShowOnServer_true__rowHeightMod _1__searchTerm_apples__searchType_1__showAdvanced_true__showSort_true__show Tools_true__sortMenuItems_1__unifiedSubviewParams_1__viewType_jst"
style="transform: translate(823px, 970px); -webkit-transform: translate(823px, 970px); -ms-transform: translate(823px, 970px); width:
237px; height: 178px; background-image:
url(//c3.staticflickr.com/7/6114/6246270647_edc7387cfc_m.jpg)">
<div class="interaction-view"></div>
All I want is for the URL of each image to be like this:
c3.staticflickr.com/7/6114/6246270647_edc7387cfc_m.jpg
Since there are no a or href tags I'm struggling to filter them out.
I tried doing some regex as well at the end such as the following:
print(soup.find_all(re.compile(r'^url\.jpg$')))
But that didn't work.
Here's my full code below anyway, thanks.
import os
import re
import urllib.request as urllib2
import bs4
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
os.makedirs('My_images', exist_ok=True)
browser = webdriver.Chrome()
browser.implicitly_wait(10)
print("Opening Flickr.com")
siteChoice = 'http://www.flickr.com'
browser.get(siteChoice)
print("Enter your search term: ")
term = input("> ")
searchField = browser.find_element_by_id('search-field')
searchField.send_keys(term)
searchField.submit()
url = siteChoice + '/search/?text=' + term
html = urllib2.urlopen(url)
soup = bs4.BeautifulSoup(html, "html.parser")
print(soup.find_all(class_='view photo-list-photo-view requiredToShowOnServer awake', style = re.compile('staticflickr')))
My changed code:
p = re.compile(r'url\(\/\/([^\)]+)\)')
test_str = str(soup)
all_urls = re.findall(p, test_str)
print('Exporting to file')
with open('flickr_urls.txt', 'w') as f:
    for i in all_urls:
        f.writelines("%s\n" % i)
print('Done')
Try this
url\(\/\/([^\)]+)\)
Demo
import re
p = re.compile(r'url\(\/\/([^\)]+)\)')
test_str = u"<div class=\"view photo-list-photo-view requiredToShowOnServer awake\" \ndata-view-signature=\"photo-list-photo-view__engagementModelName_photo-lite-\nmodels__excludePeople_false__id_6246270647__interactionViewName_photo-list-\nphoto-interaction- view__isOwner_false__layoutItem_1__measureAFT_true__model_1__modelParams_1_ _parentContainer_1__parentSignature_photolist-\n479__requiredToShowOnClient_true__requiredToShowOnServer_true__rowHeightMod _1__searchTerm_apples__searchType_1__showAdvanced_true__showSort_true__show Tools_true__sortMenuItems_1__unifiedSubviewParams_1__viewType_jst\"\n style=\"transform: translate(823px, 970px); -webkit-transform: translate(823px, 970px); -ms-transform: translate(823px, 970px); width:\n 237px; height: 178px; background-image:\n url(//c3.staticflickr.com/7/6114/6246270647_edc7387cfc_m.jpg)\">\n<div class=\"interaction-view\"></div>"
m = re.search(p, test_str)
print(m.group(1))
Output:
c3.staticflickr.com/7/6114/6246270647_edc7387cfc_m.jpg
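An alternative that avoids running the regex over the whole page source is to apply the url(...) pattern only to the style attribute of each photo div; the class and style filters below are the ones already used in the question, so this is just a sketch of that combination:
import re

pattern = re.compile(r'url\(//([^)]+)\)')
divs = soup.find_all('div', class_='photo-list-photo-view', style=re.compile('staticflickr'))

image_urls = []
for div in divs:
    m = pattern.search(div.get('style', ''))
    if m:
        image_urls.append(m.group(1))
print(image_urls)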
To scrape all the png/jpg links from a page with Selenium:
from selenium import webdriver
driver = webdriver.Firefox()
driver.get("https://www.flickr.com/")
links = driver.execute_script("return document.body.innerHTML.match(" \
"/https?:\/\/[a-z_\/0-9\-\#=&.\#]+\.(jpg|png)/gi)")
print(links)
