Extracting integers from a paragraph - Python

I am trying to extract only the fee amounts from the paragraph, but I am running into a problem: there are two fee amounts and I want both of them. This is my code for: http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx
fees_div = soup.find('div', class_='Fees hiddenContent pad-around-large tabcontent')
if fees_div:
    fees_list = fees_div.find_all('\d+','p')
    course_data['Fees'] = fees_list
    print('fees : ', fees_list)
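For reference, find_all takes a tag name (or a compiled regex) as its first argument and an attribute/class filter as the second, so find_all('\d+', 'p') looks for a tag literally named \d+ and returns nothing. A minimal sketch of what the original approach probably intended, reusing fees_div and course_data from the snippet above:
import re

fees_list = []
for p in fees_div.find_all('p'):                              # iterate the <p> tags inside the fees block
    fees_list.extend(re.findall(r'£[\d,]+', p.get_text()))    # pull out pound amounts like £9,250
course_data['Fees'] = fees_list
print('fees : ', fees_list)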

Try this please:
In [10]: import requests
In [11]: from bs4 import *
In [12]: page = requests.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
In [13]: soup = BeautifulSoup(page.content, 'html.parser')
In [14]: [x for x in soup.find('div', class_='Fees hiddenContent pad-around-large tabcontent').text.split() if u"\xA3" in x]
Out[14]: ['£9,250*', '£17,320']
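If you actually need the amounts as integers (as the title suggests), you can strip the currency sign, the thousands separators, and the footnote asterisk from those strings; a small sketch building on the output above:
fees = ['£9,250*', '£17,320']  # output from the snippet above
amounts = [int(''.join(ch for ch in fee if ch.isdigit())) for fee in fees]  # keep digits only
print(amounts)  # [9250, 17320]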

Give this a shot:
import re
import requests
from bs4 import BeautifulSoup
r = requests.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
soup = BeautifulSoup(r.text,'html.parser')
item = soup.find(id='Panel5').text
fees = re.findall(r"students:[^£]+(.*?)[*\s]",item)
print(fees)
Output:
['£9,250', '£17,320']

import requests
from bs4 import BeautifulSoup
import re
r = requests.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
soup = BeautifulSoup(r.text, 'html.parser')
fees_div = soup.find('div', class_='Fees hiddenContent pad-around-large tabcontent')
m = re.search(r'£[\d,]+', fees_div.select('p:nth-of-type(2)')[0].get_text())
fee1 = m[0]
m = re.search(r'£[\d,]+', fees_div.select('p:nth-of-type(3)')[0].get_text())
fee2 = m[0]
print(fee1, fee2)
Prints:
£9,250 £17,320
Update
You can also scrape the page using Selenium, although in this case it offers no advantage. For example (using Chrome):
from selenium import webdriver
from bs4 import BeautifulSoup
import re
options = webdriver.ChromeOptions()
options.add_argument("headless")
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(options=options)
driver.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
soup = BeautifulSoup(driver.page_source, 'html.parser')
fees_div = soup.find('div', class_='Fees hiddenContent pad-around-large tabcontent')
m = re.search(r'£[\d,]+', fees_div.select('p:nth-of-type(2)')[0].get_text())
fee1 = m[0]
m = re.search(r'£[\d,]+', fees_div.select('p:nth-of-type(3)')[0].get_text())
fee2 = m[0]
print(fee1, fee2)
driver.quit()
Update
Consider just scanning the entire HTML source, without using BeautifulSoup, for the fees with a simple regular-expression findall:
import requests
import re
r = requests.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
print(re.findall(r'£[\d,]+', r.text))
Prints:
['£9,250', '£17,320']

Related

Why am I getting a big chunk of HTML code instead of href links at the beginning, followed by a list of links that are properly extracted?

from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re

class Scraper:
    def __init__(self):
        self.driver = webdriver.Chrome(r'C:\Users\gkhat\Downloads\chromedriver.exe')
        self.url = "http://www.carwale.com/"
        self.href = []

    def load_url(self):
        self.driver.get(self.url + 'used/cars-for-sale/#sc=-1&so=-1&pn=1')

    def scroll_down(self):
        self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')

    def read_data(self):
        main = self.driver.find_element_by_xpath('/html/body/div[12]/form/section[2]/div[1]/div[4]/div[1]/div[3]/div[2]')
        soup = bs(main.get_attribute("innerHTML"), "html.parser")
        print(soup)
        for elem in soup.findAll('h2', {'class': 'card-detail-block__title'}):
            print(elem.a['href'])
            self.href.append(str(elem.a['href']))
This code
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import re
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
url = "http://www.carwale.com/used/cars-for-sale/#sc=-1&so=-1&pn=1"
driver.get(url)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
main = driver.find_element_by_xpath('/html/body/div[12]/form/section[2]/div[1]/div[4]/div[1]/div[3]/div[2]')
soup = bs(main.get_attribute("innerHTML"), "html.parser")
for elem in soup.findAll('h2', {'class': 'card-detail-block__title'}):
    print(elem.a['href'])
gives this
/used/cars-in-delhi/marutisuzuki-ciaz-2017-2018-d2251253/?slot=1&rk=1&isP=true
/used/cars-in-bangalore/hyundai-i10-2010-2017-d2295085/?slot=2&rk=2&isP=true
/used/cars-in-faridabad/marutisuzuki-alto-k10-d2332673/?slot=3&rk=3&isP=true
/used/cars-in-mumbai/renault-kwid-2015-2019-d2333781/?slot=4&rk=4&isP=true
/used/cars-in-mumbai/hyundai-i10-2010-2017-d2308033/?slot=5&rk=5&isP=true
/used/cars-in-mumbai/toyota-innova-crysta-2016-2020-d2327063/?slot=6&rk=6&isP=true
/used/cars-in-kolkata/landrover-freelander-2-2009-2011-d2321851/?slot=7&rk=7&isP=true
/used/cars-in-pune/hyundai-eon-2011-2019-d2295279/?slot=8&rk=8&isP=true
/used/cars-in-navi-mumbai/renault-kwid-2015-2019-d2303909/?slot=0&rk=9&isP=false
/used/cars-in-navi-mumbai/marutisuzuki-vitara-brezza-2016-2020-d2303835/?slot=0&rk=10&isP=false
/used/cars-in-navi-mumbai/marutisuzuki-swift-2014-2018-d2332235/?slot=0&rk=11&isP=false
/used/cars-in-navi-mumbai/volkswagen-vento-2015-2019-d2285395/?slot=0&rk=12&isP=false
/used/cars-in-navi-mumbai/jeep-compass-2017-2021-d2290491/?slot=0&rk=13&isP=false
/used/cars-in-navi-mumbai/honda-wr-v-2017-2020-d2332263/?slot=0&rk=14&isP=false
/used/cars-in-navi-mumbai/renault-kwid-2015-2019-d2332249/?slot=0&rk=15&isP=false
/used/cars-in-navi-mumbai/volkswagen-polo-2012-2014-d2310489/?slot=0&rk=16&isP=false
/used/cars-in-navi-mumbai/marutisuzuki-a-star-2008-2012-d2324317/?slot=0&rk=17&isP=false
/used/cars-in-navi-mumbai/hyundai-elite-i20-2016-2017-d2281903/?slot=0&rk=18&isP=false
/used/cars-in-navi-mumbai/hyundai-i20-active-2015-2018-d2316481/?slot=0&rk=19&isP=false
/used/cars-in-navi-mumbai/toyota-innova-crysta-2016-2020-d2324333/?slot=0&rk=20&isP=false
/used/cars-in-delhi/ford-figo-2015-2019-d2241843/?slot=0&rk=21&isP=false
/used/cars-in-vadodara/bmw-x3-2011-2014-d2198006/?slot=0&rk=22&isP=false
/used/cars-in-vadodara/volkswagen-jetta20082011-d2197994/?slot=0&rk=23&isP=false
/used/cars-in-vadodara/hyundai-santafe20112014-d2227084/?slot=0&rk=24&isP=false
/used/cars-in-vadodara/skoda-octavia-2013-2015-d2227082/?slot=0&rk=25&isP=false
/used/cars-in-vadodara/bmw-5-series-2007-2010-d2227083/?slot=0&rk=26&isP=false
/used/cars-in-mumbai/honda-cr-v-2013-2018-d2309393/?slot=0&rk=27&isP=false
/used/cars-in-mumbai/bmw-3-series-2007-2009-d2309385/?slot=0&rk=28&isP=false
/used/cars-in-mumbai/skoda-rapid-2014-2015-d2306945/?slot=0&rk=29&isP=false
/used/cars-in-mumbai/audi-a6-2015-2019-d2309389/?slot=0&rk=30&isP=false
/used/cars-in-mumbai/mercedesbenz-e-class-2013-2015-d2321627/?slot=0&rk=31&isP=false
/used/cars-in-delhi/volkswagen-vento-2010-2012-d2287775/?slot=0&rk=32&isP=false
I just removed the print(soup).

Web scraping bus stops with BeautifulSoup

I am trying to web scrape bus stop names for a given line; here is an example page for line 212: https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212. I want the output to be two lists, one with the bus stop names in one direction and the other with the opposite direction (it's clearly visible on the web page). I managed to get all the names in one list with
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    print(soup.prettify())
    all_bus_stops = []
    table = soup.find_all('a')
    for element in table:
        if element.get_text() in all_bus_stops:
            continue
        else:
            all_bus_stops.append(element.get_text())
    return all_bus_stops
print(download_bus_schedule('212'))
I guess the solution would be to somehow divide the soup into two parts.
You can use the bs4.element.Tag.findAll method:
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
    all_bus_stops = []
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html.parser')
    for s in soup.select(".holo-list"):
        bus_stops = []
        for f in s.findAll("li"):
            if f.text not in bus_stops:
                bus_stops.append(f.text)
        all_bus_stops.append(bus_stops)
    return all_bus_stops
print(download_bus_schedule('212'))
Output:
[['Pl.Hallera', 'Pl.Hallera', 'Darwina', 'Namysłowska', 'Rondo Żaba', 'Rogowska', 'Kołowa', 'Dks Targówek', 'Metro Targówek Mieszkaniowy', 'Myszkowska', 'Handlowa', 'Metro Trocka', 'Bieżuńska', 'Jórskiego', 'Łokietka', 'Samarytanka', 'Rolanda', 'Żuromińska', 'Targówek-Ratusz', 'Św.Wincentego', 'Malborska', 'Ch Targówek'],
['Ch Targówek', 'Ch Targówek', 'Malborska', 'Św.Wincentego', 'Targówek-Ratusz', 'Żuromińska', 'Gilarska', 'Rolanda', 'Samarytanka', 'Łokietka', 'Jórskiego', 'Bieżuńska', 'Metro Trocka', 'Metro Trocka', 'Metro Trocka', 'Handlowa', 'Myszkowska', 'Metro Targówek Mieszkaniowy', 'Dks Targówek', 'Kołowa', 'Rogowska', 'Rondo Żaba', '11 Listopada', 'Bródnowska', 'Szymanowskiego', 'Pl.Hallera', 'Pl.Hallera']]
import requests
from bs4 import BeautifulSoup
def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    bus_stops_1 = []
    bus_stops_2 = []
    directions = soup.find_all("ul", {"class": "holo-list"})
    for stop in directions[0].find_all("a"):
        if stop not in bus_stops_1:
            bus_stops_1.append(stop.text.strip())
    for stop in directions[1].find_all("a"):
        if stop not in bus_stops_2:
            bus_stops_2.append(stop.text.strip())
    all_bus_stops = (bus_stops_1, bus_stops_2)
    return all_bus_stops
print(download_bus_schedule('212')[0])
print(download_bus_schedule('212')[1])
I may have misunderstood as I do not know Polish but see if this helps.
from bs4 import BeautifulSoup
import requests
url = 'https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")
d = {}
for h2 in soup.select('h2.holo-divider'):
    d[h2.text] = []
    ul = h2.next_sibling
    for li in ul.select('li'):
        if li.a.text not in d[h2.text]:
            d[h2.text].append(li.a.text)
from pprint import pprint
pprint(d)
As all stops are encapsulated in the next unordered list, you could use the find_next function of bs4.
e.g.
import requests
from bs4 import BeautifulSoup

def download_bus_schedule(bus_number):
    URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    directions = ["Ch Targówek", "Pl.Hallera"]
    result = {}
    for direction in directions:
        header = soup.find(text=direction)
        stop_list = header.find_next("ul")
        stops_names = [stop.get_text() for stop in stop_list]
        result[direction] = stops_names
    return result
Plus, you might want to use f-strings to format your strings, as they improve readability and are less error prone.
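For instance, a minimal illustration of the f-string suggestion, using the bus_number variable from the snippets above:
bus_number = '212'
# Concatenation, as in the original code:
url_concat = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
# Equivalent f-string version:
url_fstring = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
assert url_concat == url_fstring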

How to open all href within a div class?

I'm new to Python and everything, and I'm looking to parse all the hrefs within a div class. My goal is to create a program that opens all the links within the div class so that I can save the photos associated with each href.
The link: https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer
The section I want to parse is "div-id: all_nail_lacquer"
So far I'm able to get all the href and this is what I have so far:
import urllib
import urllib.request
from bs4 import BeautifulSoup
theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)
nail_lacquer = (soup.find('div', {"id":"all_nail_lacquer"}))
"""
for nail_lacquer in soup.find_all('div'):
print(nail_lacquer.findAll('a')
"""
for a in soup.findAll('div', {"id":"all_nail_lacquer"}):
for b in a.findAll('a'):
print(b.get('href'))
To print image links (including hi-res images) and titles, you can use this script:
import urllib
import urllib.request
from bs4 import BeautifulSoup
theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
for img in soup.select('#all_nail_lacquer [typeof="foaf:Image"][data-src]'):
    print(img['data-src'])
    print(img['data-src'].replace('shelf_image', 'photos'))  # <-- this is the URL of the hi-res image
    print(img['title'])
    print('-' * 80)
Prints:
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/baby-take-a-vow-nlsh1-nail-lacquer-22850011001_0_0.jpg?itok=3b2ftHzc
https://www.opi.com/sites/default/files/styles/product_photos/public/baby-take-a-vow-nlsh1-nail-lacquer-22850011001_0_0.jpg?itok=3b2ftHzc
Baby, Take a Vow
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/suzi-without-a-paddle-nlf88-nail-lacquer-22006698188_21_0.jpg?itok=mgi1-rz3
https://www.opi.com/sites/default/files/styles/product_photos/public/suzi-without-a-paddle-nlf88-nail-lacquer-22006698188_21_0.jpg?itok=mgi1-rz3
Suzi Without a Paddle
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/coconuts-over-opi-nlf89-nail-lacquer-22006698189_24_1_0.jpg?itok=yasOZA4l
https://www.opi.com/sites/default/files/styles/product_photos/public/coconuts-over-opi-nlf89-nail-lacquer-22006698189_24_1_0.jpg?itok=yasOZA4l
Coconuts Over OPI
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/no-tan-lines-nlf90-nail-lacquer-22006698190_20_1_0.jpg?itok=ot_cu8c5
https://www.opi.com/sites/default/files/styles/product_photos/public/no-tan-lines-nlf90-nail-lacquer-22006698190_20_1_0.jpg?itok=ot_cu8c5
No Tan Lines
--------------------------------------------------------------------------------
...and so on.
EDIT: To save images to disk, you can use this script:
import requests
from bs4 import BeautifulSoup
theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = requests.get(theurl)
soup = BeautifulSoup(thepage.content, "html.parser")
i = 1
for img in soup.select('#all_nail_lacquer [typeof="foaf:Image"][data-src]'):
    u = img['data-src'].replace('shelf_image', 'photos')
    with open('img_{:04d}.jpg'.format(i), 'wb') as f_out:
        print('Saving {}'.format(u))
        f_out.write(requests.get(u).content)
    i += 1

How do I create a dataframe of jobs and companies that includes hyperlinks?

I am making a function to print a list of links so I can add them to a list of companies and job titles. However, I am having difficulties navigating tag sub-contents. I am looking to list all the 'href' in 'a' in 'div' like so:
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
page = "https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html"
headers = {'User-Agent':'Mozilla/5.0'}
def get_soup():
    session = requests.Session()
    pageTree = session.get(page, headers=headers)
    return BeautifulSoup(pageTree.content, 'html.parser')

pageSoup = get_soup()

def print_links():
    """this function scrapes the job title links"""
    jobLink = [div.a for div in pageSoup.find_all('div', class_='title')]
    for div in jobLink:
        print(div['href'])
I am trying to make a list, but my result is simply text and does not seem to be a working link, like so:
/pagead/clk?mo=r&ad=-6NYlbfkN0DhVAxkc_TxySVbUOs6bxWYWOfhmDTNcVTjFFBAY1FXZ2RjSBnfHw4gS8ZdlOOq-xx2DHOyKEivyG9C4fWOSDdPgVbQFdESBaF5zEV59bYpeWJ9R8nSuJEszmv8ERYVwxWiRnVrVe6sJXmDYTevCgexdm0WsnEsGomjLSDeJsGsHFLAkovPur-rE7pCorqQMUeSz8p08N_WY8kARDzUa4tPOVSr0rQf5czrxiJ9OU0pwQBfCHLDDGoyUdvhtXy8RlOH7lu3WEU71VtjxbT1vPHPbOZ1DdjkMhhhxq_DptjQdUk_QKcge3Ao7S3VVmPrvkpK0uFlA0tm3f4AuVawEAp4cOUH6jfWSBiGH7G66-bi8UHYIQm1UIiCU48Yd_pe24hfwv5Hc4Gj9QRAAr8ZBytYGa5U8z-2hrv2GaHe8I0wWBaFn_m_J10ikxFbh6splYGOOTfKnoLyt2LcUis-kRGecfvtGd1b8hWz7-xYrYkbvs5fdUJP_hDAFGIdnZHVJUitlhjgKyYDIDMJ-QL4aPUA-QPu-KTB3EKdHqCgQUWvQud4JC2Fd8VXDKig6mQcmHhZEed-6qjx5PYoSifi5wtRDyoSpkkBx39UO3F918tybwIbYQ2TSmgCHzGm32J4Ny7zPt8MPxowRw==&p=0&fvj=1&vjs=3
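For what it's worth, that output is a relative href (Indeed's job links are given as paths), so to turn it into a usable link you would join it with the site root; a small, self-contained sketch (the shortened path is just a placeholder for the long string above):
from urllib.parse import urljoin

relative_href = '/pagead/clk?mo=r&ad=...'  # placeholder for the relative href printed above
full_url = urljoin('https://www.indeed.com', relative_href)
print(full_url)  # https://www.indeed.com/pagead/clk?mo=r&ad=...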
Additionally, here is my attempt at making a list with the links:
def get_job_titles():
    """this function scrapes the job titles"""
    jobs = []
    jobTitle = pageSoup.find_all('div', class_='title')
    for span in jobTitle:
        link = span.find('href')
        if link:
            jobs.append({'title': link.text,
                         'href': link.attrs['href']})
        else:
            jobs.append({'title': span.text, 'href': None})
    return jobs
I would regex the required info out of the returned HTML and construct each URL from the parameters the page's JavaScript uses to build the URLs dynamically. Interestingly, the total number of listings is different when using requests than when using a browser. You can manually enter the number of listings, e.g. 6175 (currently), or use the number returned by the request (which is lower, so you miss some results); you could also use Selenium to get the correct initial result count. You can then issue requests with offsets to get all listings.
Listings can be randomized in terms of ordering.
It seems you can introduce a limit parameter to increase results_per_page up to 50, e.g.
https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&limit=50&start=0
Furthermore, it seems that it is possible to retrieve more results than are actually given as the total results count on the webpage.
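To make the limit/start idea concrete before the full scripts below, here is a minimal sketch of how the paginated URLs could be built (6175 is the manually observed listings count mentioned above):
base = 'https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&limit={}&start={}'
limit = 50
total = 6175  # manually observed count; could also be scraped as shown below
urls = [base.format(limit, start) for start in range(0, total, limit)]
print(len(urls), urls[0])  # 124 paginated search URLs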
Python, with 10 results per page:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs

p = re.compile(r"jobmap\[\d+\]= ({.*?})")
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}

with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    number_of_listings = int(soup.select_one('[name=description]')['content'].split(' ')[0].replace(',',''))
    #number_of_pages = math.ceil(number_of_listings/listings_per_page)
    number_of_pages = math.ceil(6175/listings_per_page)  # manually calculated
    for page in range(1, number_of_pages + 1):
        if page > 1:
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(10*page-1))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]
        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {'title': data['title'],
                   'company': data['cmp'],
                   'url': f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'}
            final[counter] = row
            counter += 1

df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
If you want to use Selenium to get the correct initial listings count:
import requests, re, hjson, math
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
d = webdriver.Chrome(r'C:\Users\HarrisQ\Documents\chromedriver.exe', options=options)
d.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
number_of_listings = int(d.find_element_by_css_selector('[name=description]').get_attribute('content').split(' ')[0].replace(',',''))
d.quit()

p = re.compile(r"jobmap\[\d+\]= ({.*?})")
p1 = re.compile(r"var searchUID = '(.*?)';")
counter = 0
final = {}

with requests.Session() as s:
    r = s.get('https://www.indeed.com/q-software-developer-l-San-Francisco-jobs.html#')
    soup = bs(r.content, 'lxml')
    tk = p1.findall(r.text)[0]
    listings_per_page = 10
    number_of_pages = math.ceil(6175/listings_per_page)  # manually calculated
    for page in range(1, number_of_pages + 1):
        if page > 1:
            r = s.get('https://www.indeed.com/jobs?q=software+developer&l=San+Francisco&start={}'.format(10*page-1))
            soup = bs(r.content, 'lxml')
            tk = p1.findall(r.text)[0]
        for item in p.findall(r.text):
            data = hjson.loads(item)
            jk = data['jk']
            row = {'title': data['title'],
                   'company': data['cmp'],
                   'url': f'https://www.indeed.com/viewjob?jk={jk}&tk={tk}&from=serp&vjs=3'}
            final[counter] = row
            counter += 1

df = pd.DataFrame(final)
output_df = df.T
output_df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)

How to Get Text between Span Tags XPATH Python

I am working with this website https://www.pealim.com/dict/?page=1. So I basically want to get the hebrew word and its pronunciation.
Below is my code; it loops through all the td tags, however it produces the exact same output every time, which is the following: {'latin': 'av', 'hebrew': u'\u05d0\u05b8\u05d1'}. Also, this code is for page=1 only; I would love to know if there is an automated way to loop through every page.
import requests
from lxml import etree

resp = requests.get("https://www.pealim.com/dict/?page=1")
htmlparser = etree.HTMLParser()
tree = etree.fromstring(resp.text, htmlparser)
for td in tree.xpath('//*//table[@class="table table-hover dict-table-t"]/tbody/tr'):
    print(td)
    data = {
        'hebrew': td.xpath('string(//span[@class="menukad"])'),
        'latin': td.xpath('string(//span[@class="dict-transcription"])'),
    }
    print(data)
I would like to collect information for every single entry on that website. Please let me know what I can do to achieve this.
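For what it's worth, the identical output from the question's code most likely comes from the inner XPath expressions starting with //, which search the whole document from the root instead of the current row; a minimal sketch of the relative (.//) version, assuming the same page structure:
import requests
from lxml import etree

resp = requests.get("https://www.pealim.com/dict/?page=1")
tree = etree.fromstring(resp.text, etree.HTMLParser())
for td in tree.xpath('//table[@class="table table-hover dict-table-t"]/tbody/tr'):
    data = {
        'hebrew': td.xpath('string(.//span[@class="menukad"])'),          # relative to this row
        'latin': td.xpath('string(.//span[@class="dict-transcription"])'),
    }
    print(data)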
import requests
from bs4 import BeautifulSoup
from pprint import pprint
for i in range(1, 411):
    data = []
    resp = requests.get("https://www.pealim.com/dict/?page={}".format(i))
    soup = BeautifulSoup(resp.text, 'lxml')
    for m, t in zip(soup.select('.menukad'), soup.select('.dict-transcription')):
        data.append((m.text, t.text))
    print('PAGE {}'.format(i))
    print('*' * 80)
    pprint(data)
Prints:
PAGE 1
********************************************************************************
[('אָב', 'av'),
('אַבָּא', 'aba'),
('אָבִיב', 'aviv'),
('אֵב', 'ev'),
('לֶאֱבוֹד', "le'evod"),
('לְהֵיאָבֵד', "lehe'aved"),
('לְאַבֵּד', "le'abed"),
('לְהִתְאַבֵּד', "lehit'abed"),
('לְהַאֲבִיד', "leha'avid"),
('הִתְאַבְּדוּת', "hit'abdut"),
('אִיבּוּד', 'ibud'),
('אֲבֵדָה', 'aveda'),
('אָבוּד', 'avud'),
('לְאַבְחֵן', "le'avchen"),
('אִיבְחוּן', 'ivchun')]
PAGE 2
********************************************************************************
[('לְאַבְטֵחַ', "le'avteach"),
('אִיבְטוּחַ', 'ivtuach'),
('אֲבַטִּיחַ', 'avatiach'),
('לֶאֱבוֹת', "le'evot"),
('אֵבֶל', 'evel'),
('לֶאֱבוֹל', "le'evol"),
('אֲבָל', 'aval'),
('לְהִתְאַבֵּל', "lehit'abel"),
('לְהִתְאַבֵּן', "lehit'aben"),
('אֶבֶן', 'even'),
('לְהַאֲבִיס', "leha'avis"),
('לְהֵיאָבֵק', "lehe'avek"),
('מַאֲבָק', "ma'avak"),
('לְאַבֵּק', "le'abek"),
('אָבָק', 'avak')]
PAGE 3
********************************************************************************
[('לְהִתְאַבֵּק', "lehit'abek"),
('לְהִתְאַבֵּק', "lehit'abek"),
('מְאוּבָּק', "me'ubak"),
('אִיבּוּק', 'ibuk'),
...and so on.
Andrej beat me to it, but alternatively you can use the .find() and .get_text() methods of BeautifulSoup:
import bs4
import requests
for page_number in range(1, 411):
    print("-" * 35, page_number, "-" * 35)
    resp = requests.get("https://www.pealim.com/dict/?page={}".format(page_number))
    soup = bs4.BeautifulSoup(resp.text, "html.parser")
    table_elem = soup.find("tbody")
    rows = table_elem.find_all("tr")
    for row in rows:
        hebrew = row.find("span", class_="menukad").get_text()
        latin = row.find("span", class_="dict-transcription").get_text()
        print("{}: {}".format(hebrew, latin))
To yield essentially the same result.
