extracting multiple data from table row in BS4 - python

In the code below I am trying to extract IP addresses and ports from the table at http://free-proxy-list.net using BeautifulSoup.
But every time I get the whole row, which is useless because I can't separate the IP addresses from their ports.
How can I get the IP and port separately?
Here is my code:
def get_proxy(self):
    response = requests.get(self.url)
    soup = bs(response.content, 'html.parser')
    data_list = [tr for tr in soup.select('tr') if tr.td]
    for i in data_list:
        print(i.text)

In your code, instead of i.text you could use i.getText(' ,') (or another separator of your choice instead of ,).
That will give you the IP and port as comma-separated text.
Moreover, for convenience you could load the proxy list into a dataframe as well.
Make the following changes/additions to your code:
import pandas as pd

soup = bs(response.content, 'html.parser')
data_list = [tr for tr in soup.select('tr') if tr.td]
data_list2 = [tr.getText(' ,') for tr in soup.select('tr') if tr.td]
# for i in data_list:
#     print(i.text)
df = pd.DataFrame(data_list2, columns=['proxy_list'])
df_proxyList = df['proxy_list'].str.split(',', expand=True)[0:300]
df_proxyList would look like this (with a few garbage columns):
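From there, picking out just the IP and port is straightforward (a sketch, assuming that after the split column 0 holds the IP and column 1 the port, matching the table layout):

# Assumption: column 0 is the IP and column 1 is the port after the split above
df_ip_port = df_proxyList[[0, 1]].rename(columns={0: 'ip', 1: 'port'})
print(df_ip_port.head())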

Try this. I had to add the isnumeric() condition to make sure that the code doesn't include the data from another table which is present on the same website.
from bs4 import BeautifulSoup as bs
import requests
from collections import defaultdict

def get_proxy(url):
    response = requests.get(url)
    soup = bs(response.content, 'html.parser')
    mapping = defaultdict()
    for tr in soup.select('tr'):
        if len(list(tr)) == 8:
            ip_val = str(list(tr)[0].text)
            port_val = str(list(tr)[1].text)
            if port_val.isnumeric():
                mapping[ip_val] = port_val
    for items in mapping.keys():
        print("IP:", items)
        print("PORT:", mapping[items])

if __name__ == '__main__':
    url = "http://free-proxy-list.net"
    get_proxy(url)
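A slightly more direct variant is to read the td cells of each row instead of counting the row's children. This is only a sketch (the function name get_proxy_td is made up) and it assumes the first two cells of each data row hold the IP and the port:

from bs4 import BeautifulSoup as bs
import requests

def get_proxy_td(url):
    # Same idea as above, but reading the <td> cells of each row directly
    soup = bs(requests.get(url).content, 'html.parser')
    for tr in soup.select('tr'):
        tds = tr.find_all('td')
        # Assumption: data rows keep the IP in the first cell and the port in the second
        if len(tds) >= 2 and tds[1].text.isnumeric():
            print("IP:", tds[0].text, "PORT:", tds[1].text)

get_proxy_td("http://free-proxy-list.net")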

Related

Python: Get element next to href

Python code:
import string
import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.basketball-reference.com/players/'
initial = list(string.ascii_lowercase)
initial_url = [url + i for i in initial]
html_initial = [urllib.request.urlopen(i).read() for i in initial_url]
soup_initial = [BeautifulSoup(i, 'html.parser') for i in html_initial]
tags_initial = [i('a') for i in soup_initial]
print(tags_initial[0][50])
Results example:
Shareef Abdur-Rahim
From the example above, I want to extract the name of the player, which is 'Shareef Abdur-Rahim', but I want to do it for all of the tags_initial lists.
Does anyone have an idea?
Could you modify your post by adding your code so that we can help you better?
Maybe this could help you:
name = soup.findAll(YOUR_SELECTOR)[0].string
UPDATE
import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = 'https://www.basketball-reference.com/players/'

# Alphabet
initial = list(string.ascii_lowercase)
datas = []

# URLs
urls = [url + i for i in initial]

for url in urls:
    # Soup object
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    # Players links
    url_links = soup.findAll("a", href=re.compile("players"))
    for link in url_links:
        # Player name
        datas.append(link.string)

print("datas : ", datas)
Then, "datas" contains all the names of the players, but I advise you to do a little processing afterwards to remove some erroneous information like "..." or perhaps duplicates
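For example, a minimal clean-up pass along those lines (just a sketch; the exact junk entries to drop depend on what you actually see in datas):

cleaned = []
for name in datas:
    # Drop None values, "..." placeholders and duplicates (assumed junk entries)
    if name and name.strip('. ') and name not in cleaned:
        cleaned.append(name)
print("cleaned:", cleaned)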
There are probably better ways but I'd do it like this:
html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"
index = html.find("a href")
index = html.find(">", index) + 1
index_end = html.find("<", index)
print(html[index:index_end])
If you're using a scraper library it probably has a similar function built-in.
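For instance, with BeautifulSoup the same extraction could look like this (a sketch; the fragment is wrapped in angle brackets so the parser sees a complete tag):

from bs4 import BeautifulSoup

html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"
soup = BeautifulSoup("<" + html + ">", "html.parser")
print(soup.a.string)  # Los Angeles Lakers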

How to get all emails from a page individually

I am trying to get all the emails from a specific page and separate them into individual variables, or even better a dictionary. This is some code.
import requests
import re
import json
from bs4 import BeautifulSoup

page = "http://www.example.net"
info = requests.get(page)
if info.status_code == 200:
    print("Page accessed")
else:
    print("Error accessing page")
code = info.content
soup = BeautifulSoup(code, 'lxml')
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
print(allEmails)
sep = ","
allEmailsStr = str(allEmails)
print(type(allEmails))
print(type(allEmailsStr))
j = allEmailsStr.split(sep, 1)[0]
print(j)
Excuse the poor variable names; I put this together quickly so it would run on its own. The output from the example website would be something like:
[k, kolyma, location, balkans]
So if I ran the program it would return only:
[k
But if I wanted it to return every email on there individually, how would I do that?
To get just the email str you can try:
emails = []
for email_link in allEmails:
    emails.append(email_link.get("href").replace('mailto:', ''))
print(emails)
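If you would rather end up with a dictionary, as mentioned in the question, one possible sketch keys each address by the visible link text (that key choice is an assumption; any other key would work too):

# Continuing from allEmails above
email_dict = {}
for email_link in allEmails:
    email_dict[email_link.get_text(strip=True)] = email_link.get("href").replace("mailto:", "")
print(email_dict)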
Based on your expected output, you can use the unwrap function of BeautifulSoup:
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
for Email in allEmails:
    print(Email.unwrap())  # This will print the whole element along with its tag
# k

Unable to scrape the conversation among debaters in order to put them in a dictionary

I've created a script to fetch all the conversation between the different debaters, excluding moderators. What I've written so far fetches the whole conversation. However, I would like to grab it like {speaker_name: (first speech, second speech), ...}.
Webpage link
Another one similar to the above link: webpage link
I've tried so far:
import requests
from bs4 import BeautifulSoup

url = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'

def get_links(link):
    r = requests.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    for item in soup.select(".field-docs-content p:has( > strong:contains('MODERATOR:')) ~ p"):
        print(item.text)

if __name__ == '__main__':
    get_links(url)
How can I scrape the conversation among debaters and put them in a dictionary?
I don't hold much hope for this lasting across lots of pages given the variability amongst the two pages I saw and the number of assumptions I have had to make. Essentially, I use regex on participant and moderators nodes text to isolate the lists of moderators and participants. I then loop all speech paragraphs and each time I encounter a moderator at the start of a paragraph I set a boolean variable store_paragraph = False and ignore subsequent paragraphs; likewise, each time I encounter a participant, I set store_paragraph = True and store that paragraph and subsequent ones under the appropriate participant key in my speaker_dict. I store each speaker_dict in a final results dictionary.
import requests, re
from bs4 import BeautifulSoup as bs
import pprint

links = ['https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas','https://www.presidency.ucsb.edu/documents/republican-presidential-candidates-debate-manchester-new-hampshire-0']
results = {}
p = re.compile(r'\b(\w+)\b\s+\(|\b(\w+)\b,')

with requests.Session() as s:
    for number, link in enumerate(links):
        r = s.get(link)
        soup = bs(r.content, 'lxml')
        participants_tag = soup.select_one('p:has(strong:contains("PARTICIPANTS:"))')
        if participants_tag.select_one('strong'):
            participants_tag.strong.decompose()
        speaker_dict = {i[0].upper() + ':' if i[0] else i[1].upper() + ':': [] for string in participants_tag.stripped_strings for i in p.findall(string)}
        # print(speaker_dict)
        moderator_data = [string for string in soup.select_one('p:has(strong:contains("MODERATOR:","MODERATORS:"))').stripped_strings][1:]
        # print(moderator_data)
        moderators = [i[0].upper() + ':' if i[0] else i[1].upper() + ':' for string in moderator_data for i in p.findall(string)]
        store_paragraph = False
        for paragraph in soup.select('.field-docs-content p:not(p:contains("PARTICIPANTS:","MODERATOR:"))')[1:]:
            string_to_compare = paragraph.text.split(':')[0] + ':'
            string_to_compare = string_to_compare.upper()
            if string_to_compare in moderators:
                store_paragraph = False
            elif string_to_compare in speaker_dict:
                speaker = string_to_compare
                store_paragraph = True
            if store_paragraph:
                speaker_dict[speaker].append(paragraph.text)
        results[number] = speaker_dict

pprint.pprint(results[1])
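If you specifically want the {speaker_name: (first speech, second speech, ...)} shape from the question, the lists can be converted afterwards (a small follow-up sketch on top of the results built above):

results_as_tuples = {speaker: tuple(speeches) for speaker, speeches in results[1].items()}
pprint.pprint(results_as_tuples)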

How to efficiently parse large HTML div-class and span data on Python BeautifulSoup?

The data needed:
I want to scrape through two webpages, one here: https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL and the other: https://finance.yahoo.com/quote/AAPL/financials?p=AAPL.
From the first page, I need the values of the row called Total Assets. These would be the 5 values in that row: 365,725,000 375,319,000 321,686,000 290,479,000 231,839,000
Then I need 5 values of the row named Total Current Liabilities. These would be: 43,658,000 38,542,000 27,970,000 20,722,000 11,506,000
From the second link, I need 10 values of the row named Operating Income or Loss. These would be: 52,503,000 48,999,000 55,241,000 33,790,000 18,385,000.
EDIT: I need the TTM value too, and then the five years' values mentioned above. Thanks.
Here is the logic of what I want. I want to run this module, and when run, I want the output to be:
TTM array: 365725000, 116866000, 64423000
year1 array: 375319000, 100814000, 70898000
year2 array: 321686000, 79006000, 80610000
My code:
This is what I have written so far. I can extract the value within the div class if I just put it in a variable, as shown below. However, how do I loop efficiently through the 'div' classes, as there are thousands of them on the page? In other words, how do I find just the values I am looking for?
# Import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

# Set the URL you want to webscrape from
url = 'https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL'

# Connect to the URL
response = requests.get(url)

# Parse HTML and save to a BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")

soup1 = BeautifulSoup("""<div class="D(tbc) Ta(end) Pstart(6px) Pend(4px) Bxz(bb) Py(8px) BdB Bdc($seperatorColor) Miw(90px) Miw(110px)--pnclg" data-test="fin-col"><span>321,686,000</span></div>""", "html.parser")
soup2 = BeautifulSoup("""<span data-reactid="1377">""", "html.parser")

# This works
print(soup1.find("div", class_="D(tbc) Ta(end) Pstart(6px) Pend(4px) Bxz(bb) Py(8px) BdB Bdc($seperatorColor) Miw(90px) Miw(110px)--pnclg").text)

# How to loop through all the relevant div classes?
EDIT - At the request of @Life is complex, edited to add date headings.
Try this using lxml:
import requests
from lxml import html

url = 'https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL'
url2 = 'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL'
page = requests.get(url)
page2 = requests.get(url2)
tree = html.fromstring(page.content)
tree2 = html.fromstring(page2.content)

total_assets = []
Total_Current_Liabilities = []
Operating_Income_or_Loss = []
heads = []

path = '//div[@class="rw-expnded"][@data-test="fin-row"][@data-reactid]'
data_path = '../../div/span/text()'
heads_path = '//div[contains(@class,"D(ib) Fw(b) Ta(end)")]/span/text()'

dats = [tree.xpath(path), tree2.xpath(path)]
for entry in dats:
    heads.append(entry[0].xpath(heads_path))
    for d in entry[0]:
        for s in d.xpath('//div[@title]'):
            if s.attrib['title'] == 'Total Assets':
                total_assets.append(s.xpath(data_path))
            if s.attrib['title'] == 'Total Current Liabilities':
                Total_Current_Liabilities.append(s.xpath(data_path))
            if s.attrib['title'] == 'Operating Income or Loss':
                Operating_Income_or_Loss.append(s.xpath(data_path))

del total_assets[0]
del Total_Current_Liabilities[0]
del Operating_Income_or_Loss[0]

print('Date Total Assets Total_Current_Liabilities:')
for date, asset, current in zip(heads[0], total_assets[0], Total_Current_Liabilities[0]):
    print(date, asset, current)
print('Operating Income or Loss:')
for head, income in zip(heads[1], Operating_Income_or_Loss[0]):
    print(head, income)
Output:
Date Total Assets Total_Current_Liabilities:
9/29/2018 365,725,000 116,866,000
9/29/2017 375,319,000 100,814,000
9/29/2016 321,686,000 79,006,000
Operating Income or Loss:
ttm 64,423,000
9/29/2018 70,898,000
9/29/2017 61,344,000
9/29/2016 60,024,000
Of course, if so desired, this can be easily incorporated into a pandas dataframe.
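For instance, a hedged sketch of that last step (the column labels are my own; the lists come from the loop above and are assumed to line up with the date headings):

import pandas as pd

# Assumption: heads[0] holds the balance-sheet dates in the same order as the row values
balance_df = pd.DataFrame({
    'Date': heads[0],
    'Total Assets': total_assets[0],
    'Total Current Liabilities': Total_Current_Liabilities[0],
})
print(balance_df)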
Some suggestions for parsing HTML with BeautifulSoup that have been helpful for me and may be helpful for you:
Use 'id' to locate the element instead of 'class', because 'class' changes more frequently than id.
Use structure info to locate the element instead of 'class'; the structure info changes less frequently.
Using headers with user-agent info to get the response is always better than no headers. In this case, if you do not specify headers, you cannot find the id 'Col1-1-Financials-Proxy', but you can find 'Col1-3-Financials-Proxy', which is not the same as the result in the Chrome inspector.
Here is runnable code for your requirement that uses structure info to locate the elements. You can definitely do it with 'class' info instead. Just remember, when your code stops working well, to check the website's source code.
# import libraries
import requests
from bs4 import BeautifulSoup

# set the URLs you want to webscrape from
first_page_url = 'https://finance.yahoo.com/quote/AAPL/balance-sheet?p=AAPL'
second_page_url = 'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL'

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

#################
# first page
#################
print('*' * 10, ' FIRST PAGE RESULT ', '*' * 10)

total_assets = {}
total_current_liabilities = {}
operating_income_or_loss = {}
page1_table_keys = []
page2_table_keys = []

# connect to the first page URL
response = requests.get(first_page_url, headers=headers)
# parse HTML and save to a BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
# the nearest id to get the result
sheet = soup.find(id='Col1-1-Financials-Proxy')
sheet_section_divs = sheet.section.find_all('div', recursive=False)
# last child
sheet_data_div = sheet_section_divs[-1]
div_ele_table = sheet_data_div.find('div').find('div').find_all('div', recursive=False)
# table header
div_ele_header = div_ele_table[0].find('div').find_all('div', recursive=False)
# the first element is the label, the remaining elements contain data, so use range(1, len())
for i in range(1, len(div_ele_header)):
    page1_table_keys.append(div_ele_header[i].find('span').text)
# table body
div_ele = div_ele_table[-1]
div_eles = div_ele.find_all('div', recursive=False)
tgt_div_ele1 = div_eles[0].find_all('div', recursive=False)[-1]
tgt_div_ele1_row = tgt_div_ele1.find_all('div', recursive=False)[-1]
tgt_div_ele1_row_eles = tgt_div_ele1_row.find('div').find_all('div', recursive=False)
# the first element is the label, the remaining elements contain data, so use range(1, len())
for i in range(1, len(tgt_div_ele1_row_eles)):
    total_assets[page1_table_keys[i - 1]] = tgt_div_ele1_row_eles[i].find('span').text
tgt_div_ele2 = div_eles[1].find_all('div', recursive=False)[-1]
tgt_div_ele2 = tgt_div_ele2.find('div').find_all('div', recursive=False)[-1]
tgt_div_ele2 = tgt_div_ele2.find('div').find_all('div', recursive=False)[-1]
tgt_div_ele2_row = tgt_div_ele2.find_all('div', recursive=False)[-1]
tgt_div_ele2_row_eles = tgt_div_ele2_row.find('div').find_all('div', recursive=False)
# the first element is the label, the remaining elements contain data, so use range(1, len())
for i in range(1, len(tgt_div_ele2_row_eles)):
    total_current_liabilities[page1_table_keys[i - 1]] = tgt_div_ele2_row_eles[i].find('span').text

print('Total Assets', total_assets)
print('Total Current Liabilities', total_current_liabilities)

#################
# second page, same logic as the first page
#################
print('*' * 10, ' SECOND PAGE RESULT ', '*' * 10)

# connect to the second page URL
response = requests.get(second_page_url, headers=headers)
# parse HTML and save to a BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")
# the nearest id to get the result
sheet = soup.find(id='Col1-1-Financials-Proxy')
sheet_section_divs = sheet.section.find_all('div', recursive=False)
# last child
sheet_data_div = sheet_section_divs[-1]
div_ele_table = sheet_data_div.find('div').find('div').find_all('div', recursive=False)
# table header
div_ele_header = div_ele_table[0].find('div').find_all('div', recursive=False)
# the first element is the label, the remaining elements contain data, so use range(1, len())
for i in range(1, len(div_ele_header)):
    page2_table_keys.append(div_ele_header[i].find('span').text)
# table body
div_ele = div_ele_table[-1]
div_eles = div_ele.find_all('div', recursive=False)
tgt_div_ele_row = div_eles[4]
tgt_div_ele_row_eles = tgt_div_ele_row.find('div').find_all('div', recursive=False)
for i in range(1, len(tgt_div_ele_row_eles)):
    operating_income_or_loss[page2_table_keys[i - 1]] = tgt_div_ele_row_eles[i].find('span').text

print('Operating Income or Loss', operating_income_or_loss)
Output with header info:
********** FIRST PAGE RESULT **********
Total Assets {'9/29/2018': '365,725,000', '9/29/2017': '375,319,000', '9/29/2016': '321,686,000'}
Total Current Liabilities {'9/29/2018': '116,866,000', '9/29/2017': '100,814,000', '9/29/2016': '79,006,000'}
********** SECOND PAGE RESULT **********
Operating Income or Loss {'ttm': '64,423,000', '9/29/2018': '70,898,000', '9/29/2017': '61,344,000', '9/29/2016': '60,024,000'}
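To get the per-period arrays asked for in the question (TTM array, year1 array, ...), the three dictionaries produced above can be combined afterwards. This is only a sketch; it assumes the date strings line up across the two pages, and since the balance sheet has no 'ttm' column those entries come back as None:

# Sketch: combine the three result dictionaries into one row per period
for period in operating_income_or_loss:   # e.g. 'ttm', '9/29/2018', ...
    row = [total_assets.get(period),
           total_current_liabilities.get(period),
           operating_income_or_loss.get(period)]
    print(period, 'array:', row)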

Scraping multiple pages with Python Beautifulsoup -- only returning data from last page

I am trying to loop through multiple pages to scrape data with Python and Beautifulsoup. My script works for one page, but when trying to iterate through multiple pages, it only returns the data from the last page scraped. I think there may be something wrong in the way I am looping or storing/appending the player_data list.
Here is what I have thus far -- any help is much appreciated.
#! python3
# downloadRecruits.py - Downloads espn college basketball recruiting database info

import requests, os, bs4, csv
import pandas as pd

# Starting url (class of 2007)
base_url = 'http://www.espn.com/college-sports/basketball/recruiting/databaseresults/_/class/2007/page/'

# Number of pages to scrape (Not inclusive, so number + 1)
pages = map(str, range(1, 3))

# url for starting page
url = base_url + pages[0]

for n in pages:
    # Create url
    url = base_url + n

    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()

    # Creating bs object
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    table = soup.find('table')

    # Get the data
    data_rows = soup.findAll('tr')[1:]

    player_data = []
    for tr in data_rows:
        tdata = []
        for td in tr:
            tdata.append(td.getText())
            if td.div and td.div['class'][0] == 'school-logo':
                tdata.append(td.div.a['href'])
        player_data.append(tdata)

print(player_data)
You should have your player_data list definition outside your loop, otherwise only the last iteration's results will be stored.
This is an indentation issue or a declaration issue, depending on the results you expect.
If you need to print the result for each page:
You can solve this by adding 4 spaces of indentation before print(player_data), so that it runs inside the loop.
If you leave the print statement outside the for loop block, it will be executed only once, after the loop has ended, so the only values it can display are the last values of player_data left over from the last iteration of the loop.
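A tiny self-contained illustration of the difference (hypothetical data, just to show the scoping):

pages = ['1', '2']
for n in pages:
    player_data = ['row from page ' + n]  # re-created on every iteration
    print(player_data)                    # indented: prints each page's rows
print(player_data)                        # unindented: only sees the last page's rows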
If you want to store all results in player_data and print it at the end:
You must declare player_data outside of and before your for loop.
player_data = []
for n in pages:
    # [...]
import requests
from bs4 import BeautifulSoup

# Starting url (class of 2007)
base_url = 'http://www.espn.com/college-sports/basketball/recruiting/databaseresults/_/class/2007/page/'

# Number of pages to scrape (Not inclusive, so number + 1)
# In Python 3, map returns an iterable object of type map, and not a subscriptable
# list which would allow you to write pages[i]. To force a list result, write:
pages = list(map(str, range(1, 3)))

# url for starting page
url = base_url + pages[0]

player_data = []
for n in pages:
    # Create url
    url = base_url + n

    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()

    # Creating bs object
    soup = BeautifulSoup(res.text, "html.parser")

    table = soup.find('table')

    # Get the data
    data_rows = soup.findAll('tr')[1:]

    for tr in data_rows:
        tdata = []
        for td in tr:
            tdata.append(td.getText())
            if td.div and td.div['class'][0] == 'school-logo':
                tdata.append(td.div.a['href'])
        player_data.append(tdata)

print(player_data)
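Since the question already imports pandas and csv, once player_data holds the rows from every page it can be written out in one line, e.g. (a sketch; the filename is made up):

import pandas as pd

# Assumption: player_data is the accumulated list of rows from all pages scraped above
pd.DataFrame(player_data).to_csv('recruits_2007.csv', index=False)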
