from bs4 import BeautifulSoup
import requests

url = "https://bararanonline.com/letter/%D5%A1?page=1"
response = requests.get(url)
soup = BeautifulSoup(response.content, "lxml")
words = soup.find_all('a', "word-href")
for word in words:
    print(word.text)
So, I got the first page. Now I want to scrape the words from all pages, and I know I have to put the page number into the URL (page={}), but I can't figure out how to do it.
Thanks in advance.
Simply define a for loop and set your range() parameters:
from bs4 import BeautifulSoup
import requests

url = "https://bararanonline.com/letter/%D5%A1?page="
words = []

for i in range(1, 3):
    response = requests.get(f'{url}{i}')
    # or, as olvin roght mentioned, by setting params:
    # response = requests.get("https://bararanonline.com/letter/ա", params={"page": i})
    soup = BeautifulSoup(response.content, "lxml")
    words.extend([word.text.strip() for word in soup.find_all('a', "word-href")])
words
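Note that range(1, 3) stops before 3, so this only fetches pages 1 and 2; set the end of the range to the last page number plus one. As the comment above mentions, you can also let requests build the query string for you via the params argument instead of formatting the URL yourself - a minimal sketch of that variant:

words = []
for i in range(1, 3):
    response = requests.get("https://bararanonline.com/letter/ա", params={"page": i})
    soup = BeautifulSoup(response.content, "lxml")
    words.extend(word.text.strip() for word in soup.find_all('a', "word-href"))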
An alternative is to use a while loop - the example starts at page 207 just to show that it stops when there is no next page, but you can change the starting page if you like:
from bs4 import BeautifulSoup
import requests

url = "https://bararanonline.com/letter/%D5%A1?page=207"
words = []

while True:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    words.extend([word.text.strip() for word in soup.find_all('a', "word-href")])
    if a := soup.select_one('a[rel="next"]'):
        url = a['href']
    else:
        break
words
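Note that the a := ... assignment expression requires Python 3.8+. On older versions, the same loop can be written without the walrus operator - a sketch with the same url and words variables:

while True:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    words.extend(word.text.strip() for word in soup.find_all('a', "word-href"))
    next_link = soup.select_one('a[rel="next"]')
    if next_link:
        url = next_link['href']
    else:
        break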
I am writing a script that will scrape a newsletter for URLs. There are some URLs in the newsletter that are irrelevant (e.g. links to articles, mailto links, social links, etc.). I added some logic to remove those links, but for some reason not all of them are being removed. Here is my code:
from os import remove
from turtle import clear
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

termSheetLinks = []

for companyURL in htmlParser.select("table#templateBody p > a"):
    termSheetLinks.append(companyURL.get('href'))

for link in termSheetLinks:
    if "fortune.com" in link in termSheetLinks:
        termSheetLinks.remove(link)
    if "forbes.com" in link in termSheetLinks:
        termSheetLinks.remove(link)
    if "twitter.com" in link in termSheetLinks:
        termSheetLinks.remove(link)

print(termSheetLinks)
When I ran it most recently, this was my output, despite trying to remove all links containing "fortune.com":
['https://fortune.com/company/blackstone-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://fortune.com/company/tpg?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://casproviders.org/asd-guidelines/', 'https://fortune.com/company/carlyle-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5', 'mailto:termsheet#fortune.com', 'https://www.afresh.com/', 'https://www.geopagos.com/', 'https://montana-renewables.com/', 'https://descarteslabs.com/', 'https://www.dealer-pay.com/', 'https://www.sequeldm.com/', 'https://pueblo-mechanical.com/', 'https://dealcloud.com/future-proof-your-firm/', 'https://apartmentdata.com/', 'https://www.irobot.com/', 'https://www.martin-bencher.com/', 'https://cell-matters.com/', 'https://www.lever.co/', 'https://www.sigulerguff.com/']
Any help would be greatly appreciated!
You do not need a regex in my opinion - instead of removing the URLs afterwards, append only those that do not contain your substrings, e.g. with a list comprehension:
[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a") if not any(x in companyURL.get('href') for x in ["fortune.com","forbes.com","twitter.com"])]
Example
from bs4 import BeautifulSoup
import requests

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

myList = ["fortune.com", "forbes.com", "twitter.com"]

[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a")
 if not any(x in companyURL.get('href') for x in myList)]
Output
['https://casproviders.org/asd-guidelines/',
'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5',
'https://www.afresh.com/',
'https://www.geopagos.com/',
'https://montana-renewables.com/',
'https://descarteslabs.com/',
'https://www.dealer-pay.com/',
'https://www.sequeldm.com/',
'https://pueblo-mechanical.com/',
'https://dealcloud.com/future-proof-your-firm/',
'https://apartmentdata.com/',
'https://www.irobot.com/',
'https://www.martin-bencher.com/',
'https://cell-matters.com/',
'https://www.lever.co/',
'https://www.sigulerguff.com/']
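If you prefer an explicit loop over the comprehension, the same filtering logic reads like this (a sketch reusing htmlParser and myList from the example above; filtered is just a name chosen here):

filtered = []
for companyURL in htmlParser.select("table#templateBody p > a"):
    href = companyURL.get('href')
    # keep the link only if none of the unwanted substrings occur in it
    if not any(x in href for x in myList):
        filtered.append(href)
print(filtered)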
Removing the links only after the for loop has finished iterating will not skip any entry - removing items from a list while you iterate over it makes the iterator skip the element that follows each removed item.
from os import remove
from turtle import clear
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

termSheetLinks = []

for companyURL in htmlParser.select("table#templateBody p > a"):
    termSheetLinks.append(companyURL.get('href'))

lRemove = []

for link in termSheetLinks:
    if "fortune.com" in link:
        lRemove.append(link)
    if "forbes.com" in link:
        lRemove.append(link)
    if "twitter.com" in link:
        lRemove.append(link)

for l in lRemove:
    termSheetLinks.remove(l)

print(termSheetLinks)
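To see why removing entries during iteration misbehaves in the first place, here is a tiny standalone illustration (plain lists, no scraping involved):

items = ["a", "b", "b", "c"]
for x in items:
    if x == "b":
        # removing shifts the remaining items left, so the iterator
        # skips the element that followed the removed "b"
        items.remove(x)
print(items)  # ['a', 'b', 'c'] - one "b" survives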
import mechanize
from bs4 import BeautifulSoup
import time
import smtplib

# True by default
while True:
    url = "https://www.google.com"
    browser = mechanize.Browser()
    browser.open(url)
    response = browser.response().read()
    soup = BeautifulSoup(response, "lxml")
    count = 1
    if str(soup).find("English") == -1:
        # wait 60 seconds (change the time (in seconds) as you wish),
        print('Checking - ' + str(count) + 'th Time')
        time.sleep(60)
        count += 1
        # continue with the script
        continue
There are a couple of problems here:
Beautiful Soup provides a method get_text() to extract the text, so you do not need to convert the soup to a string.
A string's find() returns -1 when the value is not found. Are you sure that is what you want?
Why do you use time.sleep()? What is the purpose of pausing the program?
You did not create a proper loop, which makes count redundant, and continue will raise an error outside of a loop.
If you want to get the number of occurrences of a string, you can use regex's findall() and then take its length, like: len(re.findall("English", soup_text)).
If you want to find multiple keywords, you can create a list of the keywords and then loop through them like:
for k in ["a", "b", "c"]:
    print(f'{k}: {len(re.findall(k, soup.get_text()))}')
Full example:
from bs4 import BeautifulSoup
import requests  # simple http request
import re  # regex

url = "https://www.google.com"
doc = requests.get(url)
soup = BeautifulSoup(doc.text, "lxml")
soup_text = soup.get_text()

keywords = ["Google", "English", "a"]
for k in keywords:
    print(f'{k}: {len(re.findall(k, soup_text))}')
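To make the difference between find() and re.findall() concrete, here is a tiny standalone illustration (the sample string is made up):

import re

text = "English page in English"
print(text.find("English"))              # 0 - index of the first match; -1 would mean "not found"
print(len(re.findall("English", text)))  # 2 - number of occurrences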
You are strongly encouraged to study Python thoroughly:
Python: w3school tutorial
BeautifulSoup: Documentation
Regex: w3schools tutorial or RegExr
I have built a list which contains hrefs from a website and I want to randomly select one of these links. How can I do that?
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []

response = urllib.request.urlopen(url)
soup = BeautifulSoup(response, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles = a['href']
        x = random.choice(articles)
        print(x)
That code runs, but it only selects a random character from each href string rather than a random link from the whole list.
You're very close to the answer. You just need to do this:
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []

response = urllib.request.urlopen(url)
soup = BeautifulSoup(response, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles.append(a['href'])
    x = random.choice(articles)
    print(x)

getItems()
The changes are:
We add each article to the articles array.
The random choice is now done after the loop, rather than inside the loop.
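As a side note, the collect-then-pick step can also be written as a list comprehension - a small sketch reusing the same soup and pattern (and assuming at least one matching link is found):

articles = [a['href'] for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")})]
print(random.choice(articles))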
I am trying to scrape URL links from Google. A user can input any search term and should then get the URL links, but the main problem is that this split function doesn't work and I can't fix it, so please help me.
[[Suppose a user inputs "all useless website"; Google then shows results, and the user should get only the URL links.]]
from typing import re
from bs4 import BeautifulSoup
import requests

user_input = input('Enter value for search : ')
print('Please Wait')

page_source = requests.get("https://www.google.com/search?q=" + user_input)
soup = BeautifulSoup(page_source.text, 'html.parser')

print(soup.title)
print(soup.title.string)
print(soup.title.parent.name)

all_links = soup.find_all('a')
for link in all_links:
    link_google = re.split(":(?=http)", link["href"].replace("/url?q=", ""))
    print(link_google.find["a"])
You're importing re from the wrong place. You need to use it via import re, as follows:
import re
...
link_google = re.split(":(?=http)", link["href"].replace("/url?q=", ""))
Update to make your code work:
import re correctly
fix this line from all_links = soup.find_all('a') to all_links = soup.find_all('a', href=True)
Take the link and clean it up like you did before (re.split() works perfectly but it returns a list) and add that link to a list (unpack the list) or print it
Here is the code, updated to make it work:
# issue 1
import re
from bs4 import BeautifulSoup
import requests

user_input = input('Enter value for search : ')
print('Please Wait')

page_source = requests.get("https://www.google.com/search?q=" + user_input)
soup = BeautifulSoup(page_source.text, 'html.parser')

print(soup.title)
print(soup.title.string)
print(soup.title.parent.name)

# issue 2
all_links = soup.find_all('a', href=True)
for link in all_links:
    link_from_google = re.split(":(?=http)", link["href"].replace("/url?q=", ""))
    # issue 3
    print(link_from_google[0])
>>> {returns all the http links}
A one-liner list comprehension, just for fun:
google_links = [re.split(":(?=http)", link["href"].replace("/url?q=", ""))[0] for link in soup.find_all('a', href=True)]
>>> {returns all the http links}
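For reference, a quick illustration of what re.split(":(?=http)", ...) does: the lookahead splits only on a ":" that is directly followed by "http", which separates chained URLs without breaking the "https://" inside each one. The sample value below is made up for illustration:

import re

sample = "/url?q=https://example.com/page:https://accounts.example.com/login"
cleaned = sample.replace("/url?q=", "")
# splits only at the ":" that precedes the second URL, not inside "https://"
print(re.split(":(?=http)", cleaned))
# ['https://example.com/page', 'https://accounts.example.com/login']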
This is a homework assignment. I have to sum up all the data in the span tags and print it. I extracted all the text from the span tags and appended it to a list, but I don't know how to go beyond that, as any function I type outside the for loop does not work. Also, I have to hit Enter twice after I run this in the Python command prompt to get an output.
I am new here, so please forgive the format of the question. Thanks for the help.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

lst = list()

url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
spans = soup.findAll('span', attrs={'class': 'comments'})

for span in spans:
    num = int(span.text)
    lst.append(num)
    print(num)
No need to collect into a list or anything if you are just adding them together. You can do it like this:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
spans = soup.findAll('span', attrs={'class': 'comments'})

total = 0  # named "total" so the built-in sum() is not shadowed
for span in spans:
    total += int(span.text)

print("Total Sum = " + str(total))
As you have already prepared the list of numbers from the span tags, you can get their sum using Python's built-in sum() function. Pass your list as the argument to sum().
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

lst = list()

url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
spans = soup.findAll('span', attrs={'class': 'comments'})

for span in spans:
    num = int(span.text)
    lst.append(num)
    print(num)

sum_of_nums = sum(lst)
print(sum_of_nums)
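For completeness, the whole thing can also be written more compactly by passing a generator expression straight to sum() - a minimal sketch using the same URL:

import urllib.request
from bs4 import BeautifulSoup

url = 'http://py4e-data.dr-chuck.net/comments_42.html'
soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html.parser')
print(sum(int(span.text) for span in soup.find_all('span', class_='comments')))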