I tried to extract links (href) that start with a specific word, but it returns an empty list even though the page source contains plenty of links that satisfy the condition. I am definitely missing something; below is my code:
import requests
from bs4 import BeautifulSoup
import string
import os
import re
def extract_href_page(page):
    soup = BeautifulSoup(page)
    all_links = []
    links = soup.find_all('a', pattern = re.compile(r'\w*first_word'))
    # pattern = re.compile(r'\w*recette')
    print(links)
    for link in links:
        all_links.append(link['href'])  # Save href only, for example.
    return all_links
for page_number in range(1, 63):
    requete = requests.get("https://www.website.com/pages/" + "page".capitalize() + "-" + str(page_number) + ".html")
    page = requete.content
    list_links = extract_href_page(page)
    print(list_links)
    for link in list_links:
        print(link)
Try this. The problem is that find_all() has no pattern argument: passing pattern=re.compile(...) makes BeautifulSoup look for <a> tags with a literal pattern attribute, which none of them have, so you get an empty list. Select the <a> tags that have an href and filter them yourself:
import requests
from bs4 import BeautifulSoup
import string
import os
import re
def extract_href_page(page):
    soup = BeautifulSoup(page)
    all_links = []
    links = soup.find_all('a', href=True)
    # pattern = re.compile(r'\w*recette')
    print(links)
    for link in links:
        if re.match(r"\w*first_word", link["href"], re.I):
            all_links.append(link.get("href"))
    ...
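Alternatively, BeautifulSoup accepts a compiled regular expression as the value of an attribute filter, so the href matching can be pushed into find_all() itself. A minimal sketch, not the original poster's code; first_word stands in for the actual prefix being matched:

import re
from bs4 import BeautifulSoup

def extract_href_page(page):
    soup = BeautifulSoup(page, "html.parser")
    # A compiled regex as the attribute value keeps only matching hrefs.
    # BeautifulSoup applies it with search semantics, so anchor with ^ if
    # the word must appear at the start of the href.
    pattern = re.compile(r"^\w*first_word")
    return [a["href"] for a in soup.find_all("a", href=pattern)]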
Related
I am new to web scraping. I am trying to download only the weekly .zip files from the website below. I was able to parse the <li> elements but couldn't get beyond that to download the weekly zip files.
https://download.cms.gov/nppes/NPI_Files.html
html code for 'li' tag
import requests
from bs4 import BeautifulSoup
URL = "https://download.cms.gov/nppes/NPI_Files.html"
r = requests.get(URL)
#print(r.content)
soup = BeautifulSoup(r.content, 'html.parser')
#print(soup.prettify())
links=[]
ref = soup.select('li')
#print(ref)
for i in ref:
Try the next example:
import requests
from bs4 import BeautifulSoup

URL = "https://download.cms.gov/nppes/NPI_Files.html"
r = requests.get(URL)
#print(r.content)
soup = BeautifulSoup(r.content, 'html.parser')
#print(soup.prettify())
links = []
for ref in soup.select('div.bulletlistleft > ul > li')[2:]:
    zip = 'https://download.cms.gov/nppes' + ref.a.get('href').replace('./', '/')
    links.append(zip)
print(links)
Output:
['https://download.cms.gov/nppes/NPPES_Data_Dissemination_090522_091122_Weekly.zip', 'https://download.cms.gov/nppes/NPPES_Data_Dissemination_091222_091822_Weekly.zip', 'https://download.cms.gov/nppes/NPPES_Data_Dissemination_091922_092522_Weekly.zip']
Access the <a> tags only where they exist:
for i in ref:
    if i.a is not None:
        print(i.a.get('href'))
You can either just select all links then filter them down to the weekly updates:
ref = [
    a for a in soup.select('a') if a.text
    and 'Weekly Update' in a.text
]
OR you can narrow down to a parent element before selecting the links:
ref = soup.find(
    'b', string='Weekly Incremental NPI Files'
).find_next('tr').select('a')
Either way, you'll end up with the same set of links.
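Whichever selection you use, the hrefs on that page are relative (the earlier answer shows them starting with ./), so they still need to be joined with the page's URL before downloading. A small sketch, assuming ref holds the <a> tags collected by either approach above:

from urllib.parse import urljoin

base = "https://download.cms.gov/nppes/NPI_Files.html"
# urljoin resolves the relative './NPPES_...zip' hrefs against the page URL.
weekly_links = [urljoin(base, a.get('href')) for a in ref if a.get('href')]
print(weekly_links)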
The following code will find the links, download the files, and unzip them.
import requests
from bs4 import BeautifulSoup
import os
from urllib.request import urlretrieve
import zipfile

target_folder = os.path.dirname(os.path.realpath(__file__))
base_url = "https://download.cms.gov/nppes"
url = f"{base_url}/NPI_Files.html"

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
elms = soup.select('div.bulletlistleft > ul > li > a')
for elm in elms:
    zip_filename = elm.attrs['href'].lstrip('./')
    zip_full_url = "/".join((base_url, zip_filename))
    target_zip_path = os.path.join(target_folder, zip_filename)
    target_zip_dir = ".".join(target_zip_path.split('.')[:-1])
    urlretrieve(zip_full_url, target_zip_path)
    with zipfile.ZipFile(target_zip_path, 'r') as zip_file:
        zip_file.extractall(target_zip_dir)
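If you prefer to stay with requests for the downloads as well, urlretrieve can be swapped for a streamed GET so large archives are written to disk in chunks. A rough sketch of that substitution, reusing the zip_full_url and target_zip_path names from the loop above:

import requests

def download_zip(zip_full_url, target_zip_path):
    # Stream the response body so the whole archive is never held in memory.
    with requests.get(zip_full_url, stream=True) as resp:
        resp.raise_for_status()
        with open(target_zip_path, 'wb') as fh:
            for chunk in resp.iter_content(chunk_size=8192):
                fh.write(chunk)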
Hey guys, so I got as far as being able to add the <a> tag to a list. The problem is I just want the href link to be added to the links_with_text list and not the entire <a> tag. What am I doing wrong?
from bs4 import BeautifulSoup
from requests import get
import requests

URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='hnmain')
articles = results.find_all(class_="title")

links_with_text = []
for article in articles:
    link = article.find('a', href=True)
    links_with_text.append(link)
print('\n'.join(map(str, links_with_text)))
This prints the list in exactly the format I want, but I only want the href from each <a> tag, not the entire tag. Thank you.
To get all links from https://news.ycombinator.com, you can use the CSS selector 'a.storylink'.
For example:
from bs4 import BeautifulSoup
from requests import get
import requests

URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

links_with_text = []
for a in soup.select('a.storylink'):    # <-- find all <a> with class="storylink"
    links_with_text.append(a['href'])   # <-- note the ['href']
print(*links_with_text, sep='\n')
Prints:
https://blog.mozilla.org/futurereleases/2020/06/18/introducing-firefox-private-network-vpns-official-product-the-mozilla-vpn/
https://mxb.dev/blog/the-return-of-the-90s-web/
https://github.blog/2020-06-18-introducing-github-super-linter-one-linter-to-rule-them-all/
https://www.sciencemag.org/news/2018/11/why-536-was-worst-year-be-alive
https://www.strongtowns.org/journal/2020/6/16/do-the-math-small-projects
https://devblogs.nvidia.com/announcing-cuda-on-windows-subsystem-for-linux-2/
https://lwn.net/SubscriberLink/822568/61d29096a4012e06/
https://imil.net/blog/posts/2020/fakecracker-netbsd-as-a-function-based-microvm/
https://jepsen.io/consistency
https://tumblr.beesbuzz.biz/post/621010836277837824/advice-to-young-web-developers
https://archive.org/search.php?query=subject%3A%22The+Navy+Electricity+and+Electronics+Training+Series%22&sort=publicdate
https://googleprojectzero.blogspot.com/2020/06/ff-sandbox-escape-cve-2020-12388.html?m=1
https://apnews.com/1da061ce00eb531291b143ace0eed1c9
https://support.apple.com/library/content/dam/edam/applecare/images/en_US/appleid/android-apple-music-account-payment-none.jpg
https://standpointmag.co.uk/issues/may-june-2020/the-healing-power-of-birdsong/
https://steveblank.com/2020/06/18/the-coming-chip-wars-of-the-21st-century/
https://www.videolan.org/security/sb-vlc3011.html
https://onesignal.com/careers/2023b71d-2f44-4934-a33c-647855816903
https://www.bbc.com/news/world-europe-53006790
https://github.com/efficient/HOPE
https://everytwoyears.org/
https://www.historytoday.com/archive/natural-histories/intelligence-earthworms
https://cr.yp.to/2005-590/powerpc-cwg.pdf
https://quantum.country/
http://www.crystallography.net/cod/
https://parkinsonsnewstoday.com/2020/06/17/tiny-magnetically-powered-implant-may-be-future-of-deep-brain-stimulation/
https://spark.apache.org/releases/spark-release-3-0-0.html
https://arxiv.org/abs/1712.09624
https://www.washingtonpost.com/technology/2020/06/18/data-privacy-law-sherrod-brown/
https://blog.chromium.org/2020/06/improving-chromiums-browser.html
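If you'd rather keep the find/find_all approach from the question, the only change needed is to append link['href'] instead of the tag itself, guarding against title cells that carry no link. A minimal sketch, reusing the articles list from the question:

links_with_text = []
for article in articles:
    link = article.find('a', href=True)
    if link is not None:                      # some .title cells hold no <a href>
        links_with_text.append(link['href'])  # store just the href string
print('\n'.join(links_with_text))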
I am trying to scrape unique weblinks from a site, with the criterion being no duplicates: the same link ending with "/" or "#" counts as a duplicate. This code, which I got from another Stack Overflow thread, gives me the error TypeError: Cannot mix str and non-str arguments.
import bs4 as bs
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse

BASE_url = urllib.request.urlopen("https://www.census.gov/programs-surveys/popest.html").read()
soup = bs.BeautifulSoup(BASE_url, "html.parser")

filename = "C742TaskMisajon.csv"
f = open(filename, "w")
headers = "WebLinks\n"
f.write(headers)

all_links = soup.find_all('a')
url_set = set()

def clean_links(tags, base_url):
    cleaned_links = set()
    for tag in tags:
        link = tag.get('href')
        if link is None:
            continue
        if link.endswith('/') or link.endswith('#'):
            link = link[-1]
        full_url = urllib.parse.urljoin(base_url, link)
        cleaned_links.add(full_url)
    return cleaned_links

cleaned_links = clean_links(all_links, BASE_url)
for link in cleaned_links:
    f.write(str(link) + '\n')
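For what it's worth, the TypeError comes from urljoin: BASE_url holds the raw bytes returned by .read(), while link is a str, and urllib.parse refuses to mix the two. There is also a second issue, link = link[-1], which keeps only the last character instead of trimming it. A minimal sketch of the likely fix, passing the page URL (a string) as the base and slicing with [:-1]:

import urllib.request
import urllib.parse
from bs4 import BeautifulSoup

page_url = "https://www.census.gov/programs-surveys/popest.html"
html = urllib.request.urlopen(page_url).read()   # bytes are fine for BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

def clean_links(tags, base_url):
    cleaned_links = set()
    for tag in tags:
        link = tag.get('href')
        if link is None:
            continue
        if link.endswith('/') or link.endswith('#'):
            link = link[:-1]                      # drop the trailing char, don't keep it
        cleaned_links.add(urllib.parse.urljoin(base_url, link))  # base must be a str
    return cleaned_links

cleaned_links = clean_links(soup.find_all('a'), page_url)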
I wrote my code, but it extracts all links no matter what the seeders count is. Here is the code I wrote:
from bs4 import BeautifulSoup
import urllib.request
import re

class AppURLopener(urllib.request.FancyURLopener):
    version = "Mozilla/5.0"

url = input('What site you working on today, sir?\n-> ')

opener = AppURLopener()
html_page = opener.open(url)
soup = BeautifulSoup(html_page, "lxml")

pd = str(soup.findAll('td', attrs={'align': re.compile('right')}))
for link in soup.findAll('a', attrs={'href': re.compile("^magnet")}):
    if not ('0' is pd[18]):
        print(link.get('href'), '\n')
And this is the HTML I am working with: https://imgur.com/a/32J9qF4
In this case there are 0 seeders, but it still gives me the magnet link. HELP
This code snippet will extract all magnet links from the page where seeders != 0:
from bs4 import BeautifulSoup
import requests
from pprint import pprint
soup = BeautifulSoup(requests.get('https://pirateproxy.mx/browse/201/1/3').text, 'lxml')
tds = soup.select('#searchResult td.vertTh ~ td')
links = [name.select_one('a[href^=magnet]')['href'] for name, seeders, leechers in zip(tds[0::3], tds[1::3], tds[2::3]) if seeders.text.strip() != '0']
pprint(links, width=120)
Prints:
['magnet:?xt=urn:btih:aa8a1f7847a49e640638c02ce851effff38d440f&dn=Affairs.of.State.2018.BRRip.x264.AC3-Manning&tr=udp%3A%2F%2Ftracker.leechers-paradise.org%3A6969&tr=udp%3A%2F%2Fzer0day.ch%3A1337&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969',
'magnet:?xt=urn:btih:819cb9b477462cd61ab6653ebc4a6f4e790589c3&dn=Bad.Samaritan.2018.BRRip.x264.AC3-Manning&tr=udp%3A%2F%2Ftracker.leechers-paradise.org%3A6969&tr=udp%3A%2F%2Fzer0day.ch%3A1337&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969',
'magnet:?xt=urn:btih:843d01992aa81d52be68190ee6a733ec9eee9b13&dn=The+Darkest+Minds+2018+HDCAM-1XBET&tr=udp%3A%2F%2Ftracker.leechers-paradise.org%3A6969&tr=udp%3A%2F%2Fzer0day.ch%3A1337&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969',
'magnet:?xt=urn:btih:09a23daa69c42003d905ecf0a1cefdb0474e7d88&dn=Insidious+The+Last+Key+2018+BRRip+x264+AAC-SSN&tr=udp%3A%2F%2Ftracker.leechers-paradise.org%3A6969&tr=udp%3A%2F%2Fzer0day.ch%3A1337&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969',
'magnet:?xt=urn:btih:98c42d5d620b4db834c5437a75f6da6f2d158207&dn=The+Darkest+Minds+2018+HDCAM-1XBET%5BTGx%5D&tr=udp%3A%2F%2Ftracker.leechers-paradise.org%3A6969&tr=udp%3A%2F%2Fzer0day.ch%3A1337&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969',
'magnet:?xt=urn:btih:f30ebc409b215f2a5237433d7508c7ebfabb0e16&dn=Journeyman.2017.SWESUB.BRRiP.x264.mp4&tr=udp%3A%2F%2Ftracker.leechers-paradise.org%3A6969&tr=udp%3A%2F%2Fzer0day.ch%3A1337&tr=udp%3A%2F%2Fopen.demonii.com%3A1337&tr=udp%3A%2F%2Ftracker.coppersurfer.tk%3A6969&tr=udp%3A%2F%2Fexodus.desync.com%3A6969',
...and so on.
EDIT:
The soup.select('#searchResult td.vertTh ~ td') call selects all <td> siblings of the <td> tag with class vertTh that sits inside the element with id="searchResult". There are three such siblings in each row.
The select_one('a[href^=magnet]') call then picks the link whose href begins with magnet.
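The zip(tds[0::3], tds[1::3], tds[2::3]) step simply regroups that flat list of cells back into (name, seeders, leechers) rows. A sketch of the same filtering done row by row, which some readers may find easier to follow (selectors taken from the answer above):

links = []
for row in soup.select('#searchResult tr'):
    cells = row.select('td.vertTh ~ td')       # name, seeders, leechers cells
    if len(cells) != 3:
        continue                               # skip header and footer rows
    name, seeders, leechers = cells
    if seeders.text.strip() != '0':
        a = name.select_one('a[href^=magnet]')
        if a is not None:
            links.append(a['href'])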
I'm trying to extract specific classes from multiple URLs. The tags and classes stay the same, but I need my Python program to scrape all of them as I input the links.
Here's a sample of my work:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
url = input('insert URL here: ')
#scrape elements
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
#print titles only
h1 = soup.find("h1", class_= "class-headline")
print(h1.get_text())
This works for individual URLs but not for a batch. Thanks for helping me. I learned a lot from this community.
Have a list of URLs and iterate through it:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip

urls = ['www.website1.com', 'www.website2.com', 'www.website3.com', .....]

#scrape elements
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #print titles only
    h1 = soup.find("h1", class_="class-headline")
    print(h1.get_text())
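One caveat that applies to every variant in this answer: soup.find() returns None when a page has no <h1 class="class-headline">, so h1.get_text() would raise an AttributeError on such pages. A small guard, assuming the same soup object as above:

h1 = soup.find("h1", class_="class-headline")
print(h1.get_text() if h1 is not None else "no headline found on this page")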
If you are going to prompt the user for input for each site, it can be done this way:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip

#scrape elements
msg = 'Enter Url, to exit type q and hit enter.'
url = input(msg)
while url != 'q':
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    #print titles only
    h1 = soup.find("h1", class_="class-headline")
    print(h1.get_text())
    url = input(msg)  # re-prompt so the loop can advance or exit
If you want to scrape links in batches, specify a batch size and iterate over it:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip

batch_size = 5
urllist = ["url1", "url2", "url3", .....]
url_chunks = [urllist[x:x+batch_size] for x in range(0, len(urllist), batch_size)]

def scrape_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    h1 = soup.find("h1", class_="class-headline")
    return h1.get_text()

def scrape_batch(url_chunk):
    chunk_resp = []
    for url in url_chunk:
        chunk_resp.append(scrape_url(url))
    return chunk_resp

for url_chunk in url_chunks:
    print(scrape_batch(url_chunk))
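For example, with seven placeholder URLs and batch_size = 5, the slicing above yields one chunk of five and one of two; the final chunk is simply shorter rather than padded. A quick check with illustrative URLs:

urllist = [f"https://example.com/page{i}" for i in range(7)]
batch_size = 5
url_chunks = [urllist[x:x + batch_size] for x in range(0, len(urllist), batch_size)]
print([len(chunk) for chunk in url_chunks])   # prints [5, 2]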