I am writing a script that will scrape a newsletter for URLs. There are some URLs in the newsletter that are irrelevant (e.g. links to articles, mailto links, social links, etc.). I added some logic to remove those links, but for some reason not all of them are being removed. Here is my code:
from os import remove
from turtle import clear
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

termSheetLinks = []
for companyURL in htmlParser.select("table#templateBody p > a"):
    termSheetLinks.append(companyURL.get('href'))

for link in termSheetLinks:
    if "fortune.com" in link in termSheetLinks:
        termSheetLinks.remove(link)
    if "forbes.com" in link in termSheetLinks:
        termSheetLinks.remove(link)
    if "twitter.com" in link in termSheetLinks:
        termSheetLinks.remove(link)

print(termSheetLinks)
When I ran it most recently, this was my output, despite trying to remove all links containing "fortune.com":
['https://fortune.com/company/blackstone-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://fortune.com/company/tpg?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://casproviders.org/asd-guidelines/', 'https://fortune.com/company/carlyle-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5', 'mailto:termsheet#fortune.com', 'https://www.afresh.com/', 'https://www.geopagos.com/', 'https://montana-renewables.com/', 'https://descarteslabs.com/', 'https://www.dealer-pay.com/', 'https://www.sequeldm.com/', 'https://pueblo-mechanical.com/', 'https://dealcloud.com/future-proof-your-firm/', 'https://apartmentdata.com/', 'https://www.irobot.com/', 'https://www.martin-bencher.com/', 'https://cell-matters.com/', 'https://www.lever.co/', 'https://www.sigulerguff.com/']
Any help would be greatly appreciated!
You do not need a regex here, in my opinion. Instead of removing the URLs, append only those that do not contain your substrings, e.g. with a list comprehension:
[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a") if not any(x in companyURL.get('href') for x in ["fortune.com","forbes.com","twitter.com"])]
Example
from bs4 import BeautifulSoup
import requests
termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")
myList = ["fortune.com","forbes.com","twitter.com"]
[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a")
 if not any(x in companyURL.get('href') for x in myList)]
Output
['https://casproviders.org/asd-guidelines/',
'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5',
'https://www.afresh.com/',
'https://www.geopagos.com/',
'https://montana-renewables.com/',
'https://descarteslabs.com/',
'https://www.dealer-pay.com/',
'https://www.sequeldm.com/',
'https://pueblo-mechanical.com/',
'https://dealcloud.com/future-proof-your-firm/',
'https://apartmentdata.com/',
'https://www.irobot.com/',
'https://www.martin-bencher.com/',
'https://cell-matters.com/',
'https://www.lever.co/',
'https://www.sigulerguff.com/']
Removing the links after the for loop has finished iterating will not skip any entries. Your version removes items from termSheetLinks while iterating over it, which shifts the remaining items left and causes the iterator to skip the element right after each removed one.
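For illustration, here is a minimal sketch (the link values are made up) of how in-place removal during iteration skips entries:

links = ["https://fortune.com/a", "https://fortune.com/b", "https://example.com"]
for link in links:
    if "fortune.com" in link:
        links.remove(link)
print(links)  # ['https://fortune.com/b', 'https://example.com'] - 'b' was skipped

With that in mind, the fix below collects the links to remove first and deletes them only after the iteration is done: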
from bs4 import BeautifulSoup
import requests

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

termSheetLinks = []
for companyURL in htmlParser.select("table#templateBody p > a"):
    termSheetLinks.append(companyURL.get('href'))

# first collect everything that should go ...
lRemove = []
for link in termSheetLinks:
    if "fortune.com" in link:
        lRemove.append(link)
    if "forbes.com" in link:
        lRemove.append(link)
    if "twitter.com" in link:
        lRemove.append(link)

# ... then remove it once the iteration is finished
for l in lRemove:
    termSheetLinks.remove(l)

print(termSheetLinks)
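Equivalently (a sketch of my own), you can iterate over a shallow copy so that removing from the original list is safe in a single pass:

for link in termSheetLinks[:]:  # [:] makes a shallow copy to iterate over
    if any(x in link for x in ("fortune.com", "forbes.com", "twitter.com")):
        termSheetLinks.remove(link)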
from bs4 import BeautifulSoup
import requests
url="https://bararanonline.com/letter/%D5%A1?page=1"
response=requests.get(url)
soup=BeautifulSoup(response.content, "lxml")
words=soup.find_all('a',"word-href")
for word in words:
    print(word.text)
So, I got the first page. Now I want to scrape information from all pages, and I have to put the URL page number in {} (page={}), but I can't figure out how to do it.
Thanks in advance.
Simply define a for loop and set your range() parameters:
from bs4 import BeautifulSoup
import requests

url = "https://bararanonline.com/letter/%D5%A1?page="
words = []

for i in range(1, 3):
    response = requests.get(f'{url}{i}')
    ## or, as olvin roght mentioned, by setting params:
    ## response = requests.get("https://bararanonline.com/letter/ա", params={"page": i})
    soup = BeautifulSoup(response.content, "lxml")
    words.extend([word.text.strip() for word in soup.find_all('a', "word-href")])

words
An alternative is to go with while - the example starts at page 207 just to show that it stops when there is no next page, but you can change the start page if you like:
from bs4 import BeautifulSoup
import requests

url = "https://bararanonline.com/letter/%D5%A1?page=207"
words = []

while True:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    words.extend([word.text.strip() for word in soup.find_all('a', "word-href")])
    # follow the rel="next" link until there is none
    if a := soup.select_one('a[rel="next"]'):
        url = a['href']
    else:
        break

words
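Note that the a := ... assignment (the walrus operator) requires Python 3.8+. On older versions, the end of the loop body can assign first and then test, e.g.:

    a = soup.select_one('a[rel="next"]')
    if a:
        url = a['href']
    else:
        break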
I have built a list which contains hrefs from a website and I want to randomly select one of these links. How can I do that?
from bs4 import BeautifulSoup
import urllib
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []

respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles = a['href']
        x = random.choice(articles)
        print(x)
That code runs, but it ends up picking a random character from a single href instead of one random link from all of them.
You're very close to the answer. You just need to do this:
from bs4 import BeautifulSoup
import urllib
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []

respone = urllib.request.urlopen(url)
soup = BeautifulSoup(respone, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles.append(a['href'])
    x = random.choice(articles)
    print(x)

getItems()
The changes are:
- We append each article to the articles list instead of overwriting it.
- The random choice is now done after the loop, rather than inside the loop.
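One caveat worth noting (my addition): random.choice raises an IndexError on an empty list, so if the regex matches nothing you may want a guard inside getItems:

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles.append(a['href'])
    if articles:
        print(random.choice(articles))
    else:
        print("No article links found")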
I want to check if any of the excluded sites show up. I can get it to work with just one site, but as soon as I make it a list, it errors at if donts in thingy:
TypeError: 'in <string>' requires string as left operand, not tuple
This is my code:
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re

url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')

r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    thingy = (link.get('href'))
    thingy = str(thingy)
    if donts in thingy:
        pass
    else:
        print(thingy)
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re

url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')

r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    thingy = (link.get('href'))
    thingy = str(thingy)
    if thingy in donts:
        print(thingy)
    else:
        pass
The key is to test string in tuple, not tuple in string. Note that this checks whether the whole href equals one of the tuple's entries, not whether it contains one as a substring.
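A quick check of the two operand orders in a REPL (using your donts tuple):

>>> donts = ('stackoverflow.com', 'stackexchange.com')
>>> 'stackoverflow.com' in donts
True
>>> donts in 'https://stackoverflow.com'
Traceback (most recent call last):
  ...
TypeError: 'in <string>' requires string as left operand, not tuple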
The crux of your problem is how you're searching your excluded list:
excluded = ("a", "b", "c")
links = ["a", "d", "e"]

for site in links:
    if site not in excluded:  # We want to know if the site is in the excluded list
        print(f"Site not excluded: {site}")
Reverse the order of your elements and this should work fine. I've inverted your logic here so you can skip the unnecessary pass.
As a side note, this is one reason clear variable names can help - they will help you reason about what the logic should be doing. Especially in Python where ergonomics like in exist, this is very useful.
import requests
from bs4 import BeautifulSoup
from lxml import html, etree
import sys
import re

url = ("http://stackoverflow.com")
donts = ('stackoverflow.com', 'stackexchange.com')

r = requests.get(url, timeout=6, verify=True)
soup = BeautifulSoup(r.content, 'html.parser')

for link in soup.select('a[href*="http"]'):
    thingy = (link.get('href'))
    thingy = str(thingy)
    if any(d in thingy for d in donts):
        pass
    else:
        print(thingy)
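If you only want the links that are not excluded, you can also invert the condition and drop the empty pass branch (a small rewrite of the loop above):

for link in soup.select('a[href*="http"]'):
    thingy = str(link.get('href'))
    if not any(d in thingy for d in donts):
        print(thingy)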
I'm trying to modify an existing html file so that specific keywords are printed as strong (no matter where they appear).
My attempt:
from bs4 import BeautifulSoup as soup
import re

txt = """<html><head><style></style></head><body><h2>"This is my keyword</h2><table><tr><td>This could be another instance of the keyword.</td></tr></table></body></html>"""
buzz_words = ["keyword", "apples"]
htmlSoup = soup(txt, features="html.parser")

for word in buzz_words:
    target = htmlSoup.find_all(text=re.compile(r"" + re.escape(word)))
    for v in target:
        v.replace_with(v.replace(word, "".join(["<strong>", word, "</strong>"])))

print(str(htmlSoup))
Result:
This is my &lt;strong&gt;keyword&lt;/strong&gt; (the tags come out as escaped entities, not as real tags)
Desired result:
This is my <strong>keyword</strong>
Try the following
from bs4 import BeautifulSoup as soup
import re
import html

txt = """<html><head><style></style></head><body><h2>"This is my keyword</h2><table><tr><td>This could be another instance of the keyword.</td></tr></table></body></html>"""
buzz_words = ["keyword", "apples"]
htmlSoup = soup(txt, features="html.parser")

for word in buzz_words:
    target = htmlSoup.find_all(text=re.compile(r"" + re.escape(word)))
    for v in target:
        v.replace_with(v.replace(word, "".join(["<strong>", word, "</strong>"])))

print(html.unescape(str(htmlSoup.prettify())))
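An alternative that avoids unescaping the whole document is to build a real strong element with new_tag and splice it into the text node. This is a sketch of my own, not part of the original answer, and it only wraps the first occurrence of the keyword in each text node:

from bs4 import BeautifulSoup
import re

txt = """<html><head><style></style></head><body><h2>"This is my keyword</h2><table><tr><td>This could be another instance of the keyword.</td></tr></table></body></html>"""
buzz_words = ["keyword", "apples"]
htmlSoup = BeautifulSoup(txt, features="html.parser")

for word in buzz_words:
    for text_node in htmlSoup.find_all(text=re.compile(re.escape(word))):
        before, _, after = text_node.partition(word)
        strong = htmlSoup.new_tag("strong")  # a real <strong> element, never escaped
        strong.string = word
        if before:
            text_node.insert_before(before)  # text preceding the keyword
        text_node.insert_before(strong)
        if after:
            text_node.insert_before(after)   # text following the keyword
        text_node.extract()                  # drop the original flat text node

print(str(htmlSoup))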
The script used to work, but it no longer does and I can't figure out why. I am trying to go to the link and extract/print the religion field. In Firebug, the religion field entry is within the 'tbody' and then 'td' tag structure. But now the script finds None when searching for these tags, and when I look at the parsed output via print Soup_FamSearch, I can't see any of the 'tbody' and 'td' tags that appear in Firebug.
Please let me know what I am missing?
import urllib2
import re
import csv
from bs4 import BeautifulSoup
import time
from unicodedata import normalize

FamSearchURL = 'https://familysearch.org/pal:/MM9.1.1/KH21-211'
OpenFamSearchURL = urllib2.urlopen(FamSearchURL)
Soup_FamSearch = BeautifulSoup(OpenFamSearchURL, 'lxml')
OpenFamSearchURL.close()

tbodyTags = Soup_FamSearch.find('tbody')
trTags = tbodyTags.find_all('tr', class_='result-item ')

for trTag in trTags:
    tdTags_label = trTag.find('td', class_='result-label ')
    if tdTags_label:
        tdTags_label_string = tdTags_label.get_text(strip=True)
        if tdTags_label_string == 'Religion: ':
            print trTag.find('td', class_='result-value ')
Find the Religion: label by text and get the next td sibling:
soup.find(text='Religion:').parent.find_next_sibling('td').get_text(strip=True)
Demo:
>>> import requests
>>> from bs4 import BeautifulSoup
>>>
>>> response = requests.get('https://familysearch.org/pal:/MM9.1.1/KH21-211')
>>> soup = BeautifulSoup(response.content, 'lxml')
>>>
>>> soup.find(text='Religion:').parent.find_next_sibling('td').get_text(strip=True)
Methodist
Then, you can make a nice reusable function and reuse it:
def get_field_value(soup, field):
    return soup.find(text='%s:' % field).parent.find_next_sibling('td').get_text(strip=True)

print get_field_value(soup, 'Religion')
print get_field_value(soup, 'Nationality')
print get_field_value(soup, 'Birthplace')
Prints:
Methodist
Canadian
Ontario
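If a record is missing a field, soup.find() returns None and the chained calls raise an AttributeError; a guarded variant (my sketch, same Python 2 style as above) returns None instead:

def get_field_value(soup, field):
    label = soup.find(text='%s:' % field)
    if label is None:
        return None  # this record has no such field
    return label.parent.find_next_sibling('td').get_text(strip=True)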