In my code, a user inputs a search term, and the get_all_links function parses the HTML response and extracts the links that start with ‘http’. When req is replaced with a hard-coded URL such as:
content = urllib.request.urlopen("http://www.ox.ac.uk")
The program correctly returns a list of properly formatted links. However, when req is passed in instead, no links are returned. I suspect this may be a formatting blip.
Here is my code:
import urllib.request
def get_all_links(s):
    """Return the ``http`` links found in ``<a href="...">`` tags of *s*.

    The text is scanned for the literal marker ``<a href=``; for each
    occurrence the value between the next pair of double quotes is taken.
    Only values that start with ``http`` are kept, in document order.

    Args:
        s: HTML document as text.

    Returns:
        A list of href strings starting with ``http`` (empty if none).
    """
    marker = '<a href='
    links = []
    pos = s.find(marker)
    while pos != -1:
        start = s.find('"', pos)
        end = s.find('"', start + 1) if start != -1 else -1
        if start == -1 or end == -1:
            # No complete quoted value remains after this marker, so no
            # later marker can have one either.
            break
        # Compare the whole prefix: checking only the first character would
        # also accept values such as "help.html".
        if s.startswith('http', start + 1):
            links.append(s[start + 1:end])
        pos = s.find(marker, pos + 1)
    return links
def main():
    """Search Google for a term typed by the user and print the links found.

    Prompts for a search term, requests the Google search page with a
    ``Mozilla/5.0`` User-Agent header, prints the encoded query, the full
    URL and the raw response body, then prints every link returned by
    ``get_all_links``.
    """
    # Imported by name so this function does not depend on ``urllib.parse``
    # merely happening to be loaded as a side effect of ``urllib.request``.
    from urllib.parse import urlencode

    term = input('Enter a search term: ')
    data = urlencode({'q': term})
    print(data)
    url = 'http://www.google.com/search' + '?' + data
    print(url)

    headers = {'User-Agent': 'Mozilla/5.0'}
    req = urllib.request.Request(url, None, headers)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(req) as content:
        s = content.read()
        # Use the charset declared in the response headers; fall back to
        # UTF-8 when the server does not declare one.
        charset = content.headers.get_content_charset() or 'utf-8'
    print(s)

    # errors='replace' keeps a single undecodable byte from aborting the run.
    links = get_all_links(s.decode(charset, errors='replace'))
    for link in links:  # print the returned list.
        print(link)


if __name__ == '__main__':
    main()
You should use an HTML parser, as suggested in the comments. A library like BeautifulSoup is perfect for this.
I have adapted your code to use BeautifulSoup
import urllib.request
from bs4 import BeautifulSoup
def get_all_links(s):
    """Return the anchor elements of *s* whose ``href`` starts with ``http``.

    Args:
        s: HTML markup to parse.

    Returns:
        The elements matched by the CSS selector ``a[href^="http"]``, as
        returned by ``BeautifulSoup.select``.
    """
    document = BeautifulSoup(s, "html.parser")
    # ^= is the CSS "attribute value starts with" operator.
    absolute_anchors = document.select('a[href^="http"]')
    return absolute_anchors
def main():
    """Search Google for a term typed by the user and print the links found.

    Prompts for a search term, requests the Google search page with a
    ``Mozilla/5.0`` User-Agent header, prints the encoded query, the full
    URL and the raw response body, then prints every element returned by
    ``get_all_links``.
    """
    # Imported by name so this function does not depend on ``urllib.parse``
    # merely happening to be loaded as a side effect of ``urllib.request``.
    from urllib.parse import urlencode

    term = input('Enter a search term: ')
    data = urlencode({'q': term})
    print(data)
    url = 'http://www.google.com/search' + '?' + data
    print(url)

    headers = {'User-Agent': 'Mozilla/5.0'}
    req = urllib.request.Request(url, None, headers)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(req) as content:
        s = content.read()
        # Use the charset declared in the response headers; fall back to
        # UTF-8 when the server does not declare one.
        charset = content.headers.get_content_charset() or 'utf-8'
    print(s)

    # errors='replace' keeps a single undecodable byte from aborting the run.
    links = get_all_links(s.decode(charset, errors='replace'))
    for link in links:  # print the returned list.
        print(link)


if __name__ == '__main__':
    main()
It uses the select method of the BeautifulSoup library and returns a list of selected elements (in your case anchor-tags).
Using a library like BeautifulSoup not only makes it easier, but you can also use much more complex selections. Imagine how you would have to change your code when you wanted to select all links whose href attribute contains the word "google" or "code"?
You can read the BeautifulSoup documentation here.
Related
I'm trying to parse the HTML to get the 'href' link.
My code splits the 'href' link into separate characters, but I'm hoping to get it as one complete string.
Here is my code:
# Download the listing page.
# NOTE(review): user_agent is not defined in this snippet; it is assumed to
# be set earlier by the caller.
data = requests.get(
    "https://www.chewy.com/b/food_c332_p2",
    auth=('user', 'pass'),
    headers={'User-Agent': user_agent},
)

# Round-trip the page through a local file. The encoding is stated on both
# sides so the text read back matches what was written on every platform.
with open("dogfoodpage/dg2.html", "w+", encoding="utf-8") as f:
    f.write(data.text)
with open("dogfoodpage/dg2.html", encoding="utf-8") as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")
# First product-title anchor only (find stops at the first match).
test = soup.find('a', class_="kib-product-title")

# One complete href per product-title anchor. Iterating over a single tag
# walks its children, and iterating over an href string yields characters,
# so the anchors are collected with find_all and each href is read whole.
productlink = [
    anchor.get("href")
    for anchor in soup.find_all('a', class_="kib-product-title")
]
Here is my output:
Here is the html structure for test:
# Gather the href of every product-title anchor found under each child of
# ``test``.
productlink = []
for items in test:
    title_anchors = items.find_all('a', class_="kib-product-title")
    productlink.extend(anchor.get('href') for anchor in title_anchors)
This should work. The find method returns only the first matching element, and calling get("href") on it returns a single string, so looping over that string gives you the URL split into individual characters. The find_all method returns all the matching elements, so we can iterate over them and collect the href of each one.
I don't really know what to call this issue, sorry for the undescriptive title.
My program checks if a element exists on multiple paths of a website. The program has a base url that gets different paths of the domain to check, which are located in a json file (name.json).
In its current state, my program prints 1 if the element is found and 2 if not. I want it to print the URL instead of 1 or 2. My problem is that the ids are consumed before the final for loop: when I try to print fullurl there, I only get the last id in my JSON file printed multiple times (because the earlier values are not saved) instead of each unique URL.
import json
import grequests
from bs4 import BeautifulSoup
# Load the ids to check; the with-block closes the file once they are read.
with open('name.json') as id_file:
    idlist = json.load(id_file)

baseurl = 'https://steamcommunity.com/id/'
complete_urls = [baseurl + uid for uid in idlist]

# Build the requests lazily and send them concurrently.
rs = (grequests.get(fullurl) for fullurl in complete_urls)
resp = grequests.map(rs)

for r in resp:
    # grequests.map leaves None in the slot of a request that failed, so
    # there is no page to inspect; report it as "element not found".
    if r is None:
        print('2')
        continue
    soup = BeautifulSoup(r.text, 'lxml')
    if soup.find('span', class_='actual_persona_name'):
        print('1')
    else:
        print('2')
Since the grequests.map return the responses in order of requests (see this), you can match the fullurl of each request to a response using enumerate.
import json
import grequests
from bs4 import BeautifulSoup
# Load the ids to check; the with-block closes the file once they are read.
with open('name.json') as id_file:
    idlist = json.load(id_file)

baseurl = 'https://steamcommunity.com/id/'
complete_urls = [baseurl + uid for uid in idlist]

# Build the requests lazily and send them concurrently.
rs = (grequests.get(fullurl) for fullurl in complete_urls)
resp = grequests.map(rs)

for index, r in enumerate(resp):  # use enumerate to get the index of response
    # Responses come back in request order, so the same index addresses the
    # URL that produced this response.
    print(complete_urls[index])
    # grequests.map leaves None in the slot of a request that failed, so
    # there is no page to inspect; report it as "element not found".
    if r is None:
        print('2')
        continue
    soup = BeautifulSoup(r.text, 'lxml')
    if soup.find('span', class_='actual_persona_name'):
        print('1')
    else:
        print('2')
If I understood correctly, you could just print(r.url) instead of the numbers, since the URL is stored on each response object.
# Print the URL carried by each response object.
for r in resp:
    soup = BeautifulSoup(r.text, 'lxml')
    persona = soup.find('span', class_='actual_persona_name')
    # NOTE(review): both outcomes print the same URL; the branch is kept so
    # the found / not-found cases can be given different output later.
    if not persona:
        print(r.url)
    else:
        print(r.url)
So i am trying to scrape links from a random wikipedia page here is my code thus far:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib2
# function get random page
def get_random():
    """Fetch the starting article and return the URL reported by the response.

    A fixed article is requested here; request
    'https://en.wikipedia.org/wiki/Special:Random' instead to start from a
    random article.
    """
    import requests
    response = requests.get('https://en.wikipedia.org/wiki/Carole_Ann')
    return response.url
#========================
#finding the valid link
def validlink(href):
    """Return True when *href* points at a regular wiki article.

    A link qualifies when it starts with ``/wiki/`` and contains no
    ``/<word>:`` segment (for example ``/wiki/File:...``).

    Args:
        href: The href attribute value; may be empty or None.

    Returns:
        True for a qualifying link, False otherwise.
    """
    # Imported locally so the function works no matter where in the file
    # the module-level ``import re`` appears.
    import re

    if not href or not href.startswith('/wiki/'):
        return False
    # Raw string: '\w' in a normal string literal is an invalid escape.
    return re.search(r'/\w+:', href) is None
#validlink()===========
#the first site
# Resolve the URL of the starting article.
a1 = get_random()

# Download that article and read its heading from <h1 class="firstHeading">.
blin = requests.get(a1)
soup = BeautifulSoup(blin.text, 'html.parser')
title = soup.find('h1', attrs={'class': 'firstHeading'})
print("".join(["starting website: ", a1, " Titled: ", title.text]))
print("")
#=============================
#first article done
#find body:
import re
# Walk every anchor on the starting page and print the hrefs that pass
# validlink().
body = requests.get(a1).text
soup = BeautifulSoup(body, 'lxml')
for link in soup.find_all("a"):
    # Anchors without an href yield "", which validlink() rejects.
    url = link.get("href", "")
    if validlink(url):
        print(url)
#======================
i know i'm doing this last part wrong. Im new to python so i just have no idea how to go about this part, what i need is to pull all of the links from a random site that the random page takes me to, then i pull the link and title off of that site,
then i need to pull the wikipedia links off of that page which is what i am looking to do in that last bit of code there heres another snip:
and at this point i want to print all of the links that it finds after they have been tested against my valid links function at the top:
again forgive me for being new and not understanding at this. But please help i cannot figure this out.
so the question that i have is: i need to create a snippet of code that will pull out all of the website links off of the wikipedia page (which note i still dont know how to do the for loop was my best guess based on my own research) then i need to test the links that i pulled against my validlink function, and print out all of the valid links.
If you want it as a list, then create a new list and append() each url if it is valid.
Because the same url can appear many times on a page, I also check whether the url is already in the list.
# href of every anchor on the page ('' when the attribute is missing);
# find_all('a', {'href': True}) would skip anchors without one.
hrefs = (link.get('href', '') for link in soup.find_all('a'))
# dict.fromkeys keeps the first occurrence of each valid href, in page order.
valid_urls = list(dict.fromkeys(href for href in hrefs if validlink(href)))
print(valid_urls)
from bs4 import BeautifulSoup
import requests
import re
# --- functions ---
# Matches a "/<word>:" segment such as "/File:" or "/Category:". Compiled
# once here; the raw string avoids the invalid '\w' escape.
_NAMESPACE_RE = re.compile(r'/\w+:')


def is_valid(url):
    """Return True when *url* points at a regular wiki article.

    A link qualifies when it starts with ``/wiki/`` and contains no
    ``/<word>:`` segment (for example ``/wiki/File:...``).

    Args:
        url: The href attribute value; may be empty or None.

    Returns:
        True for a qualifying link, False otherwise.
    """
    if not url or not url.startswith('/wiki/'):  # you don't need `re` to check it
        return False
    return _NAMESPACE_RE.search(url) is None
# --- main ---
# Fixed start page; use 'https://en.wikipedia.org/wiki/Special:Random' to
# begin at a random article instead.
random_url = 'https://en.wikipedia.org/wiki/Carole_Ann'

r = requests.get(random_url)
print('url:', r.url)

soup = BeautifulSoup(r.text, 'html.parser')
title = soup.find('h1', attrs={'class': 'firstHeading'})
print('starting website:', r.url)
print('titled:', title.text)
print()

# href of every anchor on the page ('' when the attribute is missing);
# find_all('a', {'href': True}) would skip anchors without one.
hrefs = (link.get('href', '') for link in soup.find_all('a'))
# dict.fromkeys keeps the first occurrence of each valid href, in page order.
valid_urls = list(dict.fromkeys(href for href in hrefs if is_valid(href)))

print('\n'.join(valid_urls))
I am unable to fetch the URL for the next page; my code throws a traceback error. Basically, I want to grab "/browse-movies?page=2".
from bs4 import BeautifulSoup
import requests
import re
url = "https://yts.ag/browse-movies?page=1"
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# find() returns None when the page has no <ul class="tsc_pagination">,
# where indexing the find_all() result with [0] would raise IndexError.
items = soup.find('ul', 'tsc_pagination')
if items is None:
    print("no tsc_pagination list found on " + url)
else:
    # Iterating over a tag walks its direct children.
    for item in items:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(item)
You could use range(1, 300) to iterate all pages:
from bs4 import BeautifulSoup
import requests
headers = {'User-Agent': 'Mozilla/5.0'}
for i in range(1, 300):
    url = "https://yts.ag/browse-movies?page=%s" % i
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup.find_all('div', 'browse-movie-wrap')
    for item in items:
        # Reset the fields for every movie so a missing value is reported
        # as 'N/A' and never shows the previous movie's data.
        title = year = rating = genre = 'N/A'

        # Title and year are read only when the bottom section is present.
        if item.find('div', 'browse-movie-bottom') is not None:
            title_tag = item.find('a', 'browse-movie-title')
            year_tag = item.find('div', 'browse-movie-year')
            if title_tag is not None:
                title = title_tag.text
            if year_tag is not None:
                year = year_tag.text

        # Rating and genre are the first and second <h4> inside the movie
        # link; checking the length replaces the blanket try/except.
        for link in item.find_all('a', 'browse-movie-link'):
            captions = link.find_all('h4')
            if len(captions) > 0:
                rating = captions[0].text
            if len(captions) > 1:
                genre = captions[1].text

        # One pre-formatted argument prints identically on Python 2 and 3.
        print("%s %s %s %s" % (year, rating, genre, title))
P.S. You might want to add time.sleep(1) to slow down a little bit in case they block your IP for being too aggressive scraping their webpages.
Edit:
Now look for the next page URL, you could use regular expression:
import re
# Look for the first <a> whose text matches the pattern.
next_page = soup.find('a', text=re.compile(r'.*Next.*'))
# find() returns None when nothing matches (presumably the case on the last
# page -- verify), so guard before reading the attribute.
if next_page is not None:
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(next_page['href'])
else:
    print("no 'Next' link found")
So what it does is look for an a tag whose content matches the regular expression '.*Next.*'.
# The same headers are sent with every request, so build them once.
headers = {'User-Agent': 'Mozilla/5.0'}
# make a url list and iterate over it
urls = ["https://yts.ag/browse-movies?page={}".format(i) for i in range(1, 10)]
for url in urls:
    response = requests.get(url, headers=headers)
    # your code here (it must set year, rating, genre and title)
    # One pre-formatted argument prints identically on Python 2 and 3.
    print("%s %s %s %s" % (year, rating, genre, title))
Make a URL list and iterate over it. You can change the range.
Here is the URL that I'am using:
http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i
In fact, on this page the link that I am looking for appears maybe 5 seconds after the page has loaded.
I see after 5 second a post request to :
http://www.protect-stream.com/secur.php
with data like so :
k=2AE_a,LHmb6kSC_c,sZNk4eNixIiPo_c,_c,Gw4ERVdriKuHJlciB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WxBkmAtz75kpYcrHzxtYt32hCYSp0WjqOQR9bY_a,ofQtw_b,
I don't understand where the 'k' value comes from.
Is there any way we could get the 'k' value using Python?
This is not going to be trivial. The k parameter value is "hidden" deep inside a script element inside nested iframes. Here is a requests + BeautifulSoup way to get to the k value:
import re
from urlparse import urljoin
# Python 3: from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
base_url = "http://www.protect-stream.com"


def _nested_frame_url(session, page_url, src_prefix):
    """Fetch *page_url* and return the absolute URL of one of its iframes.

    The iframe is the first one whose ``src`` starts with *src_prefix*.
    Raises ValueError when the page contains no such iframe.
    """
    response = session.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")
    frame = soup.select_one('iframe[src^="%s"]' % src_prefix)
    if frame is None:
        raise ValueError("no iframe with src starting %r on %s" % (src_prefix, page_url))
    # The src is relative, so resolve it against the site root.
    return urljoin(base_url, frame["src"])


with requests.Session() as session:
    # get the top frame url
    frame_url = _nested_frame_url(
        session,
        "http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i",
        "frame.php",
    )
    # get the nested frame url
    frame_url = _nested_frame_url(session, frame_url, "w.php")

    # get the frame HTML source and extract the "k" value
    response = session.get(frame_url)
    soup = BeautifulSoup(response.content, "html.parser")
    script_tag = soup.find("script", text=lambda text: text and "k=" in text)
    if script_tag is None:
        raise ValueError("no script containing 'k=' on %s" % frame_url)
    script = script_tag.get_text(strip=True)

    match = re.search(r'var k="(.*?)";', script)
    if match is None:
        raise ValueError("'var k=\"...\";' assignment not found in script")
    k_value = match.group(1)

print(k_value)
Prints:
YjfH9430zztSYgf7ItQJ4grv2cvH3mT7xGwv32rTy2HiB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WXhmwUC0ipkPRkLQepYHLyF1U0xvsrzHMcK2XBCeY3_a,O_b,