How can I get a random object from a list in Python

I have built a list which contains hrefs from a website, and I want to randomly select one of these links. How can I do that?
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
response = urllib.request.urlopen(url)
soup = BeautifulSoup(response, 'lxml')

def getItems():
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles = a['href']
        x = random.choice(articles)
        print(x)
That code runs, but it only selects a random character from a single href instead of a random link from all of them.

You're very close to the answer. You just need to do this:
from bs4 import BeautifulSoup
import urllib.request
import requests
import re
import random

url = "https://www.formula1.com/en/latest.html"
articles = []
response = urllib.request.urlopen(url)
soup = BeautifulSoup(response, 'lxml')

def getItems():
    # Collect every matching href first, then pick one at random.
    for a in soup.findAll('a', attrs={'href': re.compile("/en/latest/article.")}):
        articles.append(a['href'])
    x = random.choice(articles)
    print(x)

getItems()
The changes are:
We append each href to the articles list, instead of overwriting it.
The random choice is now done after the loop, rather than inside the loop.
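For reference, here is a more compact variant (a sketch under the same assumptions about the page structure): it collects the hrefs with a list comprehension and returns the choice instead of printing it:

import re
import random
import urllib.request
from bs4 import BeautifulSoup

def get_random_article():
    # Fetch the page and collect every article href in one pass.
    response = urllib.request.urlopen("https://www.formula1.com/en/latest.html")
    soup = BeautifulSoup(response, 'lxml')
    links = [a['href'] for a in soup.find_all('a', href=re.compile("/en/latest/article."))]
    return random.choice(links) if links else None

print(get_random_article())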

Related

Python regex: remove strings containing a substring

I am writing a script that will scrape a newsletter for URLs. There are some URLs in the newsletter that are irrelevant (e.g. links to articles, mailto links, social links, etc.). I added some logic to remove those links, but for some reason not all of them are being removed. Here is my code:
from os import remove
from turtle import clear
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

termSheetLinks = []
for companyURL in htmlParser.select("table#templateBody p > a"):
    termSheetLinks.append(companyURL.get('href'))

for link in termSheetLinks:
    if "fortune.com" in link in termSheetLinks:
        termSheetLinks.remove(link)
    if "forbes.com" in link in termSheetLinks:
        termSheetLinks.remove(link)
    if "twitter.com" in link in termSheetLinks:
        termSheetLinks.remove(link)

print(termSheetLinks)
When I ran it most recently, this was my output, despite trying to remove all links containing "fortune.com":
['https://fortune.com/company/blackstone-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://fortune.com/company/tpg?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://casproviders.org/asd-guidelines/', 'https://fortune.com/company/carlyle-group?utm_source=email&utm_medium=newsletter&utm_campaign=term-sheet&utm_content=2022080907am', 'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5', 'mailto:termsheet#fortune.com', 'https://www.afresh.com/', 'https://www.geopagos.com/', 'https://montana-renewables.com/', 'https://descarteslabs.com/', 'https://www.dealer-pay.com/', 'https://www.sequeldm.com/', 'https://pueblo-mechanical.com/', 'https://dealcloud.com/future-proof-your-firm/', 'https://apartmentdata.com/', 'https://www.irobot.com/', 'https://www.martin-bencher.com/', 'https://cell-matters.com/', 'https://www.lever.co/', 'https://www.sigulerguff.com/']
Any help would be greatly appreciated!
You do not need a regex here, in my opinion. Instead of removing URLs from the list, append only those that do not contain any of your substrings, e.g. with a list comprehension:
[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a") if not any(x in companyURL.get('href') for x in ["fortune.com","forbes.com","twitter.com"])]
Example
from bs4 import BeautifulSoup
import requests

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

myList = ["fortune.com", "forbes.com", "twitter.com"]
[companyURL.get('href') for companyURL in htmlParser.select("table#templateBody p > a")
 if not any(x in companyURL.get('href') for x in myList)]
Output
['https://casproviders.org/asd-guidelines/',
'https://ir.carlyle.com/static-files/433abb19-8207-4632-b173-9606698642e5',
'https://www.afresh.com/',
'https://www.geopagos.com/',
'https://montana-renewables.com/',
'https://descarteslabs.com/',
'https://www.dealer-pay.com/',
'https://www.sequeldm.com/',
'https://pueblo-mechanical.com/',
'https://dealcloud.com/future-proof-your-firm/',
'https://apartmentdata.com/',
'https://www.irobot.com/',
'https://www.martin-bencher.com/',
'https://cell-matters.com/',
'https://www.lever.co/',
'https://www.sigulerguff.com/']
Your original code removes items from termSheetLinks while iterating over it, which makes the iterator skip entries. Collecting the links to remove first, and removing them after the for loop, will not skip any entry:
from bs4 import BeautifulSoup
import requests

termSheet = "https://fortune.com/newsletter/termsheet"
html = requests.get(termSheet)
htmlParser = BeautifulSoup(html.text, "html.parser")

termSheetLinks = []
for companyURL in htmlParser.select("table#templateBody p > a"):
    termSheetLinks.append(companyURL.get('href'))

# Collect the links to drop first, then remove them after iterating.
lRemove = []
for link in termSheetLinks:
    if "fortune.com" in link:
        lRemove.append(link)
    if "forbes.com" in link:
        lRemove.append(link)
    if "twitter.com" in link:
        lRemove.append(link)

for l in lRemove:
    termSheetLinks.remove(l)

print(termSheetLinks)
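To see why the original approach skips entries, here is a minimal toy illustration (my example, not from the question): removing the current element shifts the remaining items left, so the iterator's internal index steps over the element that follows a match.

items = ["a", "fortune.com/x", "fortune.com/y", "b"]
for item in items:
    if "fortune.com" in item:
        items.remove(item)  # shifts "fortune.com/y" into the slot just visited
print(items)  # ['a', 'fortune.com/y', 'b'] -- one match survives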

Create a list of tuples with download URL + "href"

I'm trying to make a list of tuples, with the first element being the download URL and the second being the file name from the URL string, using the code below:
import urllib.request
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io

url = r"https://www.ers.usda.gov/data-products/livestock-meat-domestic-data"
my_bytes = urllib.request.urlopen(url)
my_bytes = my_bytes.read().decode("utf8")
parsed_html = BeautifulSoup(my_bytes, features="lxml")
table_data = parsed_html.body.find('table', attrs={'id': 'data_table'})
download_url = "https://www.ers.usda.gov"
full_download_url = [tuple(download_url, i["href"]) for i in table_data.find_all('a')]
But I've been getting "TypeError: must be str, not list" all along, and I'm not sure how to fix it. Please help! Thanks!
This is what I needed:
import urllib.request
from bs4 import BeautifulSoup

url = r"https://www.ers.usda.gov/data-products/livestock-meat-domestic-data"
my_bytes = urllib.request.urlopen(url)
my_bytes = my_bytes.read().decode("utf8")
parsed_html = BeautifulSoup(my_bytes, features="lxml")
table_data = parsed_html.body.find('table', attrs={'id': 'data_table'})
download_url = "https://www.ers.usda.gov"

def convertTuple(tup):
    # Joins an iterable of characters back into a single string.
    s = ''
    for item in tup:
        s = s + item
    return s

full_download_url = [convertTuple(tuple(download_url + i["href"])) for i in table_data.find_all('a')]
Thanks to Geeks for geeks and everyone trying to help :)
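Note that tuple(download_url + i["href"]) splits the concatenated URL into a tuple of single characters, and convertTuple just joins them back together, so the round trip (a simplification on my part, not the poster's code) reduces to plain string concatenation:

full_download_url = [download_url + i["href"] for i in table_data.find_all('a')]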
You are constructing the tuple incorrectly. tuple() accepts a single iterable argument, so tuple(download_url, i["href"]) raises a TypeError. Build the pair with tuple syntax (parentheses) instead, and it will work as expected:
full_download_url = [(download_url, i["href"]) for i in table_data.find_all('a')]
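If the goal is literally a tuple of (full download URL, file name), a sketch along these lines may be closer to what the question describes (using os.path.basename to pull the file name out of the href is my assumption, not something stated in the thread):

import os.path
full_download_url = [(download_url + a["href"], os.path.basename(a["href"]))
                     for a in table_data.find_all('a')]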

How to create a range for pages (from page 1 to x)

from bs4 import BeautifulSoup
import requests

url = "https://bararanonline.com/letter/%D5%A1?page=1"
response = requests.get(url)
soup = BeautifulSoup(response.content, "lxml")
words = soup.find_all('a', "word-href")
for word in words:
    print(word.text)
So, I got the first page. Now I want to scrape information from all the pages, and I have to put the URL page number in {} (page={}), but I can't figure out how to do it.
Thanks in advance.
Simply define a for loop and set your range() parameters:
from bs4 import BeautifulSoup
import requests

url = "https://bararanonline.com/letter/%D5%A1?page="
words = []
for i in range(1, 3):
    response = requests.get(f'{url}{i}')
    # or, as olvin roght mentioned, by setting params:
    # response = requests.get("https://bararanonline.com/letter/ա", params={"page": i})
    soup = BeautifulSoup(response.content, "lxml")
    words.extend([word.text.strip() for word in soup.find_all('a', "word-href")])
words
An alternative is to use a while loop. This example starts at page 207 just to show that it stops when there is no next page, but you can change the starting page if you like:
from bs4 import BeautifulSoup
import requests

url = "https://bararanonline.com/letter/%D5%A1?page=207"
words = []
while True:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "lxml")
    words.extend([word.text.strip() for word in soup.find_all('a', "word-href")])
    # Follow the rel="next" pagination link until there is none.
    if a := soup.select_one('a[rel="next"]'):
        url = a['href']
    else:
        break
words
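To match the title's "from page 1 to x" more directly, the range version can be wrapped in a small helper (a sketch; the function name and parameter are mine, not from the answers):

from bs4 import BeautifulSoup
import requests

def scrape_pages(base_url, last_page):
    # Fetch pages 1..last_page and collect the word links from each.
    words = []
    for page in range(1, last_page + 1):
        response = requests.get(base_url, params={"page": page})
        soup = BeautifulSoup(response.content, "lxml")
        words.extend(a.text.strip() for a in soup.find_all('a', "word-href"))
    return words

words = scrape_pages("https://bararanonline.com/letter/%D5%A1", 3)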

Get data from an XML document with criteria

I have the following code:
import pandas as pd
import urllib.parse
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
url = 'http://windte2001.acepta.com/v01/E67EBB4910CFDCB067EB7D85FBA6E5511D0E64A9'.replace('/v01/', '/depot/')
x = urlopen(url)
new = x.read()
soup = BeautifulSoup(new, "lxml-xml")
result = soup.find_all(['NmbItem','QtyItem'])
which returns the following result from the XML:
[<NmbItem>SERV. MANEJO DE LIQUIDOS</NmbItem>, <QtyItem>22.00</QtyItem>, <NmbItem>SERV. MANEJO DE RESPEL</NmbItem>, <QtyItem>1.00</QtyItem>]
All I need: if an NmbItem contains 'LIQUIDOS', bring me its QtyItem, which in this case is 22.00.
How can I do this in Python with this XML?
You can use a regular expression:
import re
from bs4 import BeautifulSoup

# `new` is the XML payload fetched in the question's code
soup = BeautifulSoup(new, 'xml')
result = soup.find('NmbItem', text=re.compile("LIQUIDOS")).find_next('QtyItem').text
print(result)
You can do it like this:
result = soup.find_all(['NmbItem'])
for item in result:
    if 'LIQUIDOS' in item.text:
        print(list(item.next_siblings)[3].text)
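If the document always alternates NmbItem/QtyItem pairs (an assumption on my part, based on the output shown above), zipping the two tag lists gives a name-to-quantity lookup without relying on fragile sibling offsets:

qty_by_name = {n.text: q.text
               for n, q in zip(soup.find_all('NmbItem'), soup.find_all('QtyItem'))}
print([qty for name, qty in qty_by_name.items() if 'LIQUIDOS' in name])  # ['22.00']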

Web scraping: for loop stuck on first item in list (Python)

I'm trying to loop through a list of names. It seems like it should be very simple, but it's not working: I only receive the first name on the list and nothing more. Why is it not moving on to the next name?
Here's my code:
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.aus.edu/info/200170/college_of_architecture_art_and_design/269/department_of_architecture/4')
soup = BeautifulSoup(page.content, "html.parser")
content = soup.find(class_="supContact")
content_items = content.find_all(class_="contactToggle selected")
names = content_items[0]

s_name = []
for item in name:
    s_name.append(content.find(class_="contactToggle selected").text)
    if not names:
        continue
print(s_name)
# print(names.prettify())
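The likely cause (my reading of the code, since no fix is shown in the thread): names = content_items[0] keeps only the first match, and the loop body calls content.find(...), which always returns that same first element. A sketch of a fix, iterating over everything find_all returns:

import requests
from bs4 import BeautifulSoup

page = requests.get('http://www.aus.edu/info/200170/college_of_architecture_art_and_design/269/department_of_architecture/4')
soup = BeautifulSoup(page.content, "html.parser")
content = soup.find(class_="supContact")

# find_all returns every matching element; loop over those results
# instead of re-finding the first match on each iteration.
s_name = [item.text.strip() for item in content.find_all(class_="contactToggle selected")]
print(s_name)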
