Python scraping href adjust url - python

This code works for the URL http://www.schulliste.eu/schule/ but not for
http://www.schulliste.eu/type/gymnasien/
Does anybody know why? I think it has something to do with the keyword "title"
Also I like to have the plain email adresses (without brackets and quotes) among themselves, is that possible?
import requests
from bs4 import BeautifulSoup as soup
def get_emails(_links: list, _r=[0, 10]):
for i in range(*_r):
new_d = soup(requests.get(_links[i]).text, 'html.parser').find_all('a', {'class':'my_modal_open'})
if new_d:
yield new_d[-1]['title']
d = soup(requests.get('http://www.schulliste.eu/schule/').text, 'html.parser')
results = [i['href'] for i in d.find_all('a')][52:-9]
print(list(get_emails(results)))

I guess that it does not work, b/c searched item 'a', {'class':'my_modal_open'} is not found by the second link.
To print it without quotes, you could try this:
items = list(get_emails(results))
for item in items:
print(item)

Related

Change scraped output

I have a loop putting URLs into my broswer and scraping its content, generating this output:
2PRACE,0.0014
Hispanic,0.1556
API,0.0688
Black,0.0510
AIAN,0.0031
White,0.7200
The code looks like this:
f1 = open('urlz.txt','r',encoding="utf8")
ethnicity_urls = f1.readlines()
f1.close()
from urllib import request
from bs4 import BeautifulSoup
import time
import openpyxl
import pprint
for each in ethnicity_urls:
time.sleep(1)
scraped = request.urlopen(each)
soup = BeautifulSoup(scraped)
soup1 = soup.select('p')
print(soup1)
resultFile = open('results.csv','a')
resultFile.write(pprint.pformat(soup1))
resultFile.close()
My problem is quite simple yet I do not find any tool that helps me achieve it. I would like to change the output from a list with "\n" in it to this:
2PRACE,0.0014 Hispanic,0.1556 API,0.0688 Black,0.0510 AIAN,0.0031 White,0.7200
I did not succeed by using replace as it told me I am treating a number of elements the same as a single element.
My approach here was:
for each in ethnicity_urls:
time.sleep(1)
scraped = request.urlopen(each)
soup = BeautifulSoup(scraped)
soup1 = soup.select('p')
soup2 = soup1.replace('\n',' ')
print(soup2)
resultFile = open('results.csv','a')
resultFile.write(pprint.pformat(soup2))
resultFile.close()
Can you help me find the correct approach to mutate the output before writing it to a csv?
The error message I get:
AttributeError: ResultSet object has no attribute 'replace'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
See the solution to the problem in my answer below. Thanks for all the responses!
soup1 seems to be an iterable, so you cannot just call replace on it.
Instead you could loop through all string items in soup1 and then call replace for every single one of them and then save the changes string to your soup2 variable. Something like this:
for e in soup1:
soup2.append(e.replace('\n',' '))
You need to iterate over the soup.
Soup is a list of elements:
The BS4 Documentation is excellent and has many many examples:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
Use strip() to remove the \n
for x in soup1:
for r in x.children:
try:
print(r.strip())
except TypeError:
pass
Thank you both for the ideas and resources. I think I could implement what you suggested. The current build is
for each in ethnicity_urls:
time.sleep(1)
scraped = request.urlopen(each)
soup = BeautifulSoup(scraped)
soup1 = soup.select('p')
for e in soup1:
soup2 = str(soup1)
soup2 = soup2.replace('\n','')
print(soup2)
resultFile = open('results.csv','a')
resultFile.write(pprint.pformat(soup2))
resultFile.close()
And works just fine. I can do the final adjustments now in excel.

Removing quotes from re.findall output

I am trying to remove the quotes from my re.findall output using Python 3. I tried suggestions from various forums but it didn't work as expected finally thought of asking out here myself.
My code:
import requests
from bs4 import BeautifulSoup
import re
import time
price = [];
while True:
url = "https://api.binance.com/api/v3/ticker/price?symbol=ETHUSDT"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
data = soup.prettify()
for p in data:
match = re.findall('\d*\.?\d+',data)
print("ETH/USDT",match)
price.append(match)
break
Output of match gives:
['143.19000000']. I would like it to be like: [143.1900000] but I cannot figure out how to do this.
Another problem I am encountering is that the list price appends every object like a single list. So the output of price would be for example [[a], [b], [c]]. I would like it to be like [a, b, c] I am having a bit of trouble to solve these two problems.
Thanks :)
Parse the response from requests.get() as JSON, rather than using BeautifulSoup:
import requests
url = "https://api.binance.com/api/v3/ticker/price?symbol=ETHUSDT"
response = requests.get(url)
response.raise_for_status()
data = response.json()
print(data["price"])
To get floats instead of strings:
float_match = [float(el) for el in match]
To get a list instead of a list of lists:
for el in float_match:
price.append(el)

Python scraping delete duplicates

I dont wanna have a email address twice, with this code I get the error
TypeError: unhashable type: 'list'
So i assume that the line
allLinks= set()
is wrong and I have to use a tuple and not a list, is that right?
Thats my code:
import requests
from bs4 import BeautifulSoup as soup
def get_emails(_links:list):
for i in range(len(_links)):
new_d = soup(requests.get(_links[i]).text, 'html.parser').find_all('a', {'class':'my_modal_open'})
if new_d:
yield new_d[-1]['title']
start = 20
while True:
d = soup(requests.get('http://www.schulliste.eu/type/gymnasien/?bundesland=&start={page_id}'.format(page_id=start)).text, 'html.parser')
results = [i['href'] for i in d.find_all('a')][52:-9]
results = [link for link in results if link.startswith('http://')]
next_page=d.find('div', {'class': 'paging'}, 'weiter')
if next_page:
start+=20
else:
break
allLinks= set()
if results not in allLinks:
print(list(get_emails(results)))
allLinks.add(results)
You're trying to add an entire list of emails as a single entry in the set.
What you want is to add the actual emails each one in a separate set entry.
The problem is in this line:
allLinks.add(results)
It adds the entire results list as a single element in the set and that doesn't work. Use this instead:
allLinks.update(results)
This will update the set with elements from the list, but each element will be a separate entry in the set.
I got it working but I still get duplicate emails.
allLinks = []
if results not in allLinks:
print(list(get_emails(results)))
allLinks.append((results))
Does anybody know why?

BeautifulSoup find a key value inside a code snippet inside a tag

My goal is to obtain the value for the 'sitekey' from a page source. The snippet of the code is here. The page in question is this
Right now, doing
soup = BeautifulSoup(url,'html.parser')
soup.find('div',{"class":"field field--required"})
does not work since there are multiple div tags with the same class name. How would I solve this issue?
Thank you in advance.
Edit:
def sitekey_search(atc_link):
response = session.get(atc_link)
soup = BeautifulSoup(response.content, 'html.parser')
sitekey = soup.select("div script")[0]
print(sitekey)
m = re.match("""\"(\w+)\"""", sitekey)
if m:
print(m.groups())
You can use:
soup.select("div.field.field-required")
it will give you a list with the divs found.
soup = BeautifulSoup(a,'lxml')
sitekey = soup.select("div script")[0]
b = sitekey.text
print(re.findall(r'"([^"]*)"', b))
This should do the job, the variable a [1st line] is the input (html),
b is only the script part and the regular expression prints everything in between quotes, in this case, the key, you can use additionally.strip("'") if you want to remove the quotes from the key or replace("'","")

How can I group it by using "search" function in regular expression?

I have been developing a python web-crawler to collect the used car stock data from this website. (http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page=20)
First of all, I would like to collect only "BMW" from the list. So, I used "search" function in regular expression like the code below. But, it keeps returning "None".
Is there anything wrong in my code?
Please give me some advice.
Thanks.
from bs4 import BeautifulSoup
import urllib.request
import re
CAR_PAGE_TEMPLATE = "http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page="
def fetch_post_list():
for i in range(20,21):
URL = CAR_PAGE_TEMPLATE + str(i)
res = urllib.request.urlopen(URL)
html = res.read()
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', class_='cyber')
print ("Page#", i)
# 50 lists per each page
lists=table.find_all('tr', itemtype="http://schema.org/Article")
count=0
r=re.compile("[BMW]")
for lst in lists:
if lst.find_all('td')[3].find('em').text:
lst_price=lst.find_all('td')[3].find('em').text
lst_title=lst.find_all('td')[1].find('a').text
lst_link = lst.find_all('td')[1].find('a')['href']
lst_photo_url=''
if lst.find_all('td')[0].find('img'):
lst_photo_url = lst.find_all('td')[0].find('img')['src']
count+=1
else: continue
print('#',count, lst_title, r.search("lst_title"))
return lst_link
fetch_post_list()
r.search("lst_title")
This is searching inside the string literal "lst_title", not the variable named lst_title, that's why it never matches.
r=re.compile("[BMW]")
The square brackets indicate that you're looking for one of those characters. So, for example, any string containing M will match. You just want "BMW". In fact you don't even need regular expressions, you can just test:
"BMW" in lst_title

Categories

Resources