I am trying to scrape some words from a random website, but the following program shows no errors and no output when I try printing the results.
I have checked the code twice and even added an if statement to see whether the program is getting any words at all.
import requests
import operator
from bs4 import BeautifulSoup

def word_count(url):
    wordlist = []
    source_code = requests.get(url)
    source = BeautifulSoup(source_code.text, features="html.parser")
    for post_text in source.findAll('a', {'class': 'txt'}):
        word_string = post_text.string
        if word_string is not None:
            word = word_string.lower().split()
            for each_word in word:
                print(each_word)
                wordlist.append(each_word)
        else:
            print("None")

word_count('https://mumbai.craigslist.org/')
I am expecting all the words under the class "txt" to be displayed in the output.
OP: I am expecting all the words of the class text to be displayed in the output
The culprit:
for post_text in source.findAll('a', {'class':'txt'}):
The reason:
The anchor tag has no class txt, but the span tag inside it does.
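You can verify this quickly with a small check (a sketch; the exact counts depend on the live page):

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://mumbai.craigslist.org/').text, features="html.parser")
print(len(soup.find_all('a', {'class': 'txt'})))     # 0 -- no anchor carries class "txt"
print(len(soup.find_all('span', {'class': 'txt'})))  # > 0 -- the spans inside the anchors do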
Hence:
import requests
from bs4 import BeautifulSoup

def word_count(url):
    source_code = requests.get(url)
    source = BeautifulSoup(source_code.text, features="html.parser")
    for post_text in source.findAll('a'):
        s_text = post_text.find('span', class_="txt")
        if s_text is not None:
            print(s_text.text)

word_count('https://mumbai.craigslist.org/')
OUTPUT:
community
activities
artists
childcare
classes
events
general
groups
local news
lost+found
missed connections
musicians
pets
.
.
.
You are targeting the wrong elements.
If you use
print(source)
everything works fine, but the moment you try to target the element with findAll you are targeting something that isn't there, because you get back an empty list.
If you replace
for post_text in source.findAll('a', {'class':'txt'}):
with
for post_text in source.find_all('a'):
everything seems to work fine.
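For completeness, a minimal sketch of the whole function with that replacement, also looking up the span with class "txt" inside each anchor (an assumption based on the other answers here) so the category words come out cleanly:

import requests
from bs4 import BeautifulSoup

def word_count(url):
    wordlist = []
    source = BeautifulSoup(requests.get(url).text, features="html.parser")
    # Iterate over all anchors, then look for the span that actually carries the text.
    for post_text in source.find_all('a'):
        span = post_text.find('span', class_='txt')
        if span is not None:
            for each_word in span.text.lower().split():
                print(each_word)
                wordlist.append(each_word)
    return wordlist

word_count('https://mumbai.craigslist.org/')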
I have visited https://mumbai.craigslist.org/ and found there is no <a class="txt">, only <span class="txt">, so I think you can try this:
def word_count(url):
    wordlist = []
    source_code = requests.get(url)
    source = BeautifulSoup(source_code.text, features="html.parser")
    for post_text in source.findAll('span', {'class': 'txt'}):
        word_string = post_text.text
        if word_string is not None:
            word = word_string.lower().split()
            for each_word in word:
                print(each_word)
                wordlist.append(each_word)
        else:
            print("None")
It will output correctly:
community
activities
artists
childcare
classes
events
general
...
Hope that helps you, and comment if you have further questions. : )
Related
I'm a beginner at programming, so I have a problem with the find method in BeautifulSoup when I use it for web scraping. I have this code:
from os import execle, link, unlink, write
from typing import Text
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest

job_titleL = []
company_nameL = []
location_nameL = []
experience_inL = []
links = []
salary = []
job_requirementsL = []
date = []

result = requests.get(f"https://wuzzuf.net/search/jobs/?a=%7B%7D&q=python&start=1")
source = result.content
soup = BeautifulSoup(source, "lxml")
job_titles = soup.find_all("h2", {"class": "css-m604qf"})
companies_names = soup.find_all("a", {"class": "css-17s97q8"})
locations_names = soup.find_all("span", {"class": "css-5wys0k"})
experience_in = soup.find_all("a", {"class": "css-5x9pm1"})
posted_new = soup.find_all("div", {"class": "css-4c4ojb"})
posted_old = soup.find_all("div", {"class": "css-do6t5g"})
posted = [*posted_new, *posted_old]

for L in range(len(job_titles)):
    job_titleL.append(job_titles[L].text)
    links.append(job_titles[L].find('a').attrs['href'])
    company_nameL.append(companies_names[L].text)
    location_nameL.append(locations_names[L].text)
    experience_inL.append(experience_in[L].text)
    date_text = posted[L].text.replace("-", "").strip()
    date.append(posted[L].text)

for link in links:
    result = requests.get(link)
    source = result.content
    soup = BeautifulSoup(source, "lxml")
    requirements = soup.find("div", {"class": "css-1t5f0fr"}).ul
    requirements1 = soup.find("div", {"class": "css-1t5f0fr"}).p
    respon_text = ""
    if requirements:
        for li in requirements.find_all("li"):
            print(li)
    if requirements1:
        for br in requirements1.find_all("br"):
            print(br)
    respon_text += li.text + "|"
    job_requirementsL.append(respon_text)

file_list = [job_titleL, company_nameL, date, location_nameL, experience_inL, links, job_requirementsL]
exported = zip_longest(*file_list)

with open('newspeard2.csv', "w") as spreadsheet:
    wr = csv.writer(spreadsheet)
    wr.writerow(["job title", "company name", "date", "location", "experience in", "links", "job requirements"])
    wr.writerows(exported)
Note: I'm not very good at English :(
So when I use the find method to get the job requirements from each job on the website page (Wuzzuf), and use a for loop to go through each text in the job requirements, it returns an error saying "'NoneType' object has no attribute 'find_all'". After searching for why this happens, and after inspecting each job page, I found that some job pages use br, p and strong tags in the job requirements. I didn't know what to do, so I used an if statement to test it; it returns the tags, but the br tag is empty, without text. So please can you see where the problem is and answer me, thanks.
the webpage:
https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=1
the job used p and br tags:
https://wuzzuf.net/jobs/p/T9WuTpM3Mveq-Senior-Data-Scientist-Evolvice-GmbH-Cairo-Egypt?o=28&l=sp&t=sj&a=python|search-v3|hpb
Sorry, I didn't understand the problem with p sooner.
for link in links:
    result = requests.get(link)
    source = result.content
    soup = BeautifulSoup(source, "lxml")
    requirements_div = soup.find("div", {"class": "css-1t5f0fr"})
    respon_text = []
    for child in requirements_div.children:
        if child.name == 'ul':
            for li in child.find_all("li"):
                respon_text.append(li.text)
        elif child.name == 'p':
            for x in child.contents:
                if x.name == 'br':
                    pass
                elif x.name == 'strong':
                    respon_text.append(x.text)
                else:
                    respon_text.append(x)
    job_requirementsL.append('|'.join(respon_text))
I am trying to scrape URL links from Google. Users can enter any search term and then get the resulting URL links, but the main problem is that the split function doesn't work, and I can't fix it. So please help me.
[[Suppose a user enters "all useless website"; Google then shows results, and the user should get only the URL links.]]
from typing import re
from bs4 import BeautifulSoup
import requests

user_input = input('Enter value for search : ')
print('Please Wait')
page_source = requests.get("https://www.google.com/search?q=" + user_input)
soup = BeautifulSoup(page_source.text, 'html.parser')
print(soup.title)
print(soup.title.string)
print(soup.title.parent.name)

all_links = soup.find_all('a')
for link in all_links:
    link_google = re.split(":(?=http)", link["href"].replace("/url?q=", ""))
    print(link_google.find["a"])
You're importing re from the wrong place. You need to use it via import re, as follows:
import re
...
link_google = re.split(":(?=http)", link["href"].replace("/url?q=", ""))
Update to make your code work:
1. Import re correctly.
2. Change all_links = soup.find_all('a') to all_links = soup.find_all('a', href=True).
3. Take the link and clean it up like you did before (re.split() works perfectly, but it returns a list), then add that link to a list (unpacking the result) or print it.
Here is the code updated to make it work
# issue 1
import re
from bs4 import BeautifulSoup
import requests

user_input = input('Enter value for search : ')
print('Please Wait')
page_source = requests.get("https://www.google.com/search?q=" + user_input)
soup = BeautifulSoup(page_source.text, 'html.parser')
print(soup.title)
print(soup.title.string)
print(soup.title.parent.name)

# issue 2
all_links = soup.find_all('a', href=True)
for link in all_links:
    link_from_google = re.split(":(?=http)", link["href"].replace("/url?q=", ""))
    # issue 3
    print(link_from_google[0])
>>> {returns all the http links}
One-liner list comprehension, just for fun:
google_links = [re.split(":(?=http)", link["href"].replace("/url?q=", ""))[0] for link in soup.find_all('a', href=True)]
>>> {returns all the http links}
In order to practice and to help my sister get emails from doctors for her baby, I have designed this email harvester. It makes a search, cleans the URLs found, adds them to a dictionary and parses them for emails in two different ways.
The code has been taken from different places, so if you correct me, please explain your improvement clearly, as I am already working at the limit of my knowledge.
The question is how to get the emails better (and improve the code, if possible). I'll post the code and the exact output below:
CODE of my program:
import requests, re, webbrowser, bs4
from selenium import webdriver
from bs4 import BeautifulSoup
import time, random, webbrowser
import urllib.request

def google_this():  # Googles and gets the first few links
    search_terms = ['Fiat', 'Lambrusco']
    added_terms = 'email contact? #'

    # This searches for certain keywords in Google and parses results with BS
    for el in search_terms:
        webpage = 'http://google.com/search?q=' + str(el) + str(added_terms)
        print('Searching for the terms...', el, added_terms)
        headers = {'User-agent': 'Mozilla/5.0'}
        res = requests.get(webpage, headers=headers)
        # res.raise_for_status()
        statusCode = res.status_code
        if statusCode == 200:
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            serp_res_rawlink = soup.select('.r a')

            dicti = []  # This gets the href links
            for link in serp_res_rawlink:
                url = link.get('href')
                if 'pdf' not in url:
                    dicti.append(url)

            dicti_url = []  # This cleans the "url?q=" from link
            for el in dicti:
                if '/url?q=' in el:
                    result = (el.strip('/url?q='))
                    dicti_url.append(result)
            # print(dicti_url)

            dicti_pretty_links = []  # This cleans the gibberish at end of url
            for el in dicti_url[0:4]:
                pretty_url = el.partition('&')[0]
                dicti_pretty_links.append(pretty_url)
            print(dicti_pretty_links)

            for el in dicti_pretty_links:  # This converts page in BS soup
                # browser = webdriver.Firefox()
                # browser.get(el)
                # print('I have been in the element below and closed the window')
                # print(el)
                # time.sleep(1)
                # browser.close()
                webpage = (el)
                headers = {'User-agent': 'Mozilla/5.0'}
                res = requests.get(webpage, headers=headers)
                # res.raise_for_status()
                statusCode = res.status_code
                if statusCode == 200:
                    soup = bs4.BeautifulSoup(res.text, 'lxml')

                    # This is the first way to search for an email in soup
                    emailRegex = re.compile(r'([a-zA-Z0-9_.+]+#+[a-zA-Z0-9_.+])', re.VERBOSE)
                    mo = emailRegex.findall(res.text)
                    # mo = emailRegex.findall(soup.prettify())
                    print('THIS BELOW IS REGEX')
                    print(mo)

                    # This is the second way to search for an email in soup:
                    mailtos = soup.select('a[href^=mailto]')
                    for el in mailtos:
                        print('THIS BELOW IS MAILTOS')
                        print(el.text)

                time.sleep(random.uniform(0.5, 1))

google_this()
And here is the OUTPUT of running this very same code above. As you can see, some emails seem to be found, but they are cut just after the "#" symbol:
C:\Users\SK\AppData\Local\Programs\Python\Python35-32\python.exe C:/Users/SK/PycharmProjects/untitled/another_temperase.py
Searching for the terms... Fiat email contact? #
['http://www.fcagroup.com/en-US/footer/Pages/contacts.aspx', 'http://www.fiat.co.uk/header-contacts', 'http://www.fiatusa.com/webselfservice/fiat/', 'https://twitter.com/nic_fincher81/status/672505531689394176']
THIS BELOW IS REGEX
['investor.relations#f', 'investor.relations#f', 'sustainability#f', 'sustainability#f', 'mediarelations#f', 'mediarelations#f']
THIS BELOW IS MAILTOS
investor.relations#fcagroup.com
THIS BELOW IS MAILTOS
sustainability#fcagroup.com
THIS BELOW IS MAILTOS
mediarelations#fcagroup.com
THIS BELOW IS REGEX
[]
THIS BELOW IS REGEX
[]
THIS BELOW IS REGEX
['nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y', 'nic_fincher81#y']
Searching for the terms... Lambrusco email contact? #
['http://www.labattagliola.it/%3Flang%3Den']
Process finished with exit code 0
I would recommend a more restrictive version that still catches the whole email:
([a-zA-Z0-9_.+]+#[a-zA-Z0-9_.+]+)
The reason nothing after the first letter following the # is caught is that the regex is missing a trailing +:
([a-zA-Z0-9_.+]+#+[a-zA-Z0-9_.+]+)
Originally this part, [a-zA-Z0-9_.+], simply said to catch one of any of the following characters: a-z, A-Z, 0-9, ., _, +.
I would also be careful about #+, which says to catch 1 or more "#" symbols.
So a potentially valid email could look like this:
..................########################.................
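A quick sketch of the difference (the sample text here is made up for illustration, reusing addresses from the output above):

import re

sample = "Contact investor.relations#fcagroup.com or sustainability#fcagroup.com"  # illustrative text

old_pattern = re.compile(r'([a-zA-Z0-9_.+]+#+[a-zA-Z0-9_.+])')   # missing the trailing +
new_pattern = re.compile(r'([a-zA-Z0-9_.+]+#[a-zA-Z0-9_.+]+)')   # suggested version

print(old_pattern.findall(sample))  # ['investor.relations#f', 'sustainability#f']
print(new_pattern.findall(sample))  # ['investor.relations#fcagroup.com', 'sustainability#fcagroup.com']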
So I have BeautifulSoup code which visits the main page of a website
and scrapes the links there. However, when I get the links in Python, I can't seem to clean up the link (after it's converted to a string) for concatenation with the root URL.
import re
import requests
import bs4

list1 = []

def get_links():
    regex3 = re.compile('/[a-z\-]+/[a-z\-]+')
    response = requests.get('http://noisetrade.com')
    soup = bs4.BeautifulSoup(response.text)
    links = soup.select('div.grid_info a[href]')
    for link in links:
        lk = link.get('href')
        prtLk = regex3.findall(lk)
        list1.append(prtLk)

def visit_pages():
    url1 = str(list1[1])
    print(url1)

get_links()
visit_pages()
produces output: "['/stevevantinemusic/unsolicited-material']"
desired output:"/stevevantinemusic/unsolicited-material"
I have tried .strip() and .replace() and re.sub/match/etc. I can't seem to isolate the characters '[', "'" and ']', which are the ones I need removed. I had iterated through it with sub-strings, but that feels inefficient. I'm sure I'm missing something obvious.
findall returns a list of results, so you can either write:
for link in links:
    lk = link.get('href')
    urls = regex3.findall(lk)
    if urls:
        prtLk = urls[0]
        list1.append(prtLk)
or better, use the search method:
for link in links:
    lk = link.get('href')
    m = regex3.search(lk)
    if m:
        prtLk = m.group()
        list1.append(prtLk)
Those brackets were the result of converting a list with one element to a string.
For example:
l = ['text']
str(l)
results in:
"['text']"
Here I use the regexp r'[\[\'\]]' to replace any of the unwanted characters with the empty string:
$ cat pw.py
import re

def visit_pages():
    url1 = "['/stevevantinemusic/unsolicited-material']"
    url1 = re.sub(r'[\[\'\]]', '', url1)
    print(url1)

visit_pages()
$ python pw.py
/stevevantinemusic/unsolicited-material
Here is an example of what I think you are trying to do:
>>> import bs4
>>> with open('noise.html', 'r') as f:
... lines = f.read()
...
>>> soup = bs4.BeautifulSoup(lines)
>>> root_url = 'http://noisetrade.com'
>>> for link in soup.select('div.grid_info a[href]'):
... print(root_url + link.get('href'))
...
http://noisetrade.com/stevevantinemusic
http://noisetrade.com/stevevantinemusic/unsolicited-material
http://noisetrade.com/jessicarotter
http://noisetrade.com/jessicarotter/winter-sun
http://noisetrade.com/geographermusic
http://noisetrade.com/geographermusic/live-from-the-el-rey-theatre
http://noisetrade.com/kaleo
http://noisetrade.com/kaleo/all-the-pretty-girls-ep
http://noisetrade.com/aviddancer
http://noisetrade.com/aviddancer/an-introduction
http://noisetrade.com/thinkr
http://noisetrade.com/thinkr/quiet-kids-ep
http://noisetrade.com/timcaffeemusic
http://noisetrade.com/timcaffeemusic/from-conversations
http://noisetrade.com/pearl
http://noisetrade.com/pearl/hello
http://noisetrade.com/staceyrandolmusic
http://noisetrade.com/staceyrandolmusic/fables-noisetrade-sampler
http://noisetrade.com/sleepyholler
http://noisetrade.com/sleepyholler/sleepy-holler
http://noisetrade.com/sarahmcgowanmusic
http://noisetrade.com/sarahmcgowanmusic/indian-summer
http://noisetrade.com/briandunne
http://noisetrade.com/briandunne/songs-from-the-hive
Remember also that bs4 has its own types that it uses.
A good way to debug your scripts would be to place:
for link in links:
    import pdb; pdb.set_trace()  # the script will stop for debugging here
    lk = link.get('href')
    prtLk = regex3.findall(lk)
    list1.append(prtLk)
Anywhere you want to debug.
And then you could do something like this within pdb:
next
l
print(type(lk))
print(links)
dir()
dir(links)
dir(lk)
As I want to remove duplicated placeholders in an HTML page, I use the .next_sibling operator of BeautifulSoup. As long as the duplicates are on the same line, this works fine (see data). But sometimes there is an empty line between them, so I want .next_sibling to ignore it (have a look at data2).
This is the code:
from bs4 import BeautifulSoup, Tag

data = "<p>method-removed-here</p><p>method-removed-here</p><p>method-removed-here</p>"
data2 = """<p>method-removed-here</p>
<p>method-removed-here</p>
<p>method-removed-here</p>
<p>method-removed-here</p>
<p>method-removed-here</p>
"""

soup = BeautifulSoup(data)
string = 'method-removed-here'
for p in soup.find_all("p"):
    while isinstance(p.next_sibling, Tag) and p.next_sibling.name == 'p' and p.text == string:
        p.next_sibling.decompose()
print(soup)
Output for data is as expected:
<html><head></head><body><p>method-removed-here</p></body></html>
Output for data2 (this needs to be fixed):
<html><head></head><body><p>method-removed-here</p>
<p>method-removed-here</p>
<p>method-removed-here</p>
<p>method-removed-here</p>
<p>method-removed-here</p>
</body></html>
I couldn't find useful information on this in the BeautifulSoup 4 documentation, and .next_element is also not what I am looking for.
I could solve this issue with a workaround. The problem is described in the Google group for BeautifulSoup, and they suggest using a preprocessor for HTML files:
import re

def bs_preprocess(html):
    """remove distracting whitespaces and newline characters"""
    pat = re.compile('(^[\s]+)|([\s]+$)', re.MULTILINE)
    html = re.sub(pat, '', html)        # remove leading and trailing whitespaces
    html = re.sub('\n', ' ', html)      # convert newlines to spaces; this preserves newline delimiters
    html = re.sub('[\s]+<', '<', html)  # remove whitespaces before opening tags
    html = re.sub('>[\s]+', '>', html)  # remove whitespaces after closing tags
    return html
That's not the very best solution, but it is one.
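A minimal sketch of how the preprocessor could be wired into the code from the question (assuming the bs_preprocess function above and the data2, BeautifulSoup and Tag names from the question):

html = bs_preprocess(data2)                # strip the whitespace-only nodes between the <p> tags
soup = BeautifulSoup(html, 'html.parser')
string = 'method-removed-here'
for p in soup.find_all("p"):
    while isinstance(p.next_sibling, Tag) and p.next_sibling.name == 'p' and p.text == string:
        p.next_sibling.decompose()
print(soup)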
Also not a great solution, but this worked for me:
def get_sibling(element):
    sibling = element.next_sibling
    if sibling == "\n":
        return get_sibling(sibling)
    else:
        return sibling
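A rough sketch of how it might slot into the loop from the question (assuming the soup, string and Tag names from the original code; untested against anything but the data2 snippet above):

for p in soup.find_all("p"):
    ns = get_sibling(p)                  # skips a bare "\n" node and lands on the next tag
    while isinstance(ns, Tag) and ns.name == 'p' and p.text == string:
        ns.decompose()
        ns = get_sibling(p)              # re-check after the duplicate has been removed
print(soup)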
Use find_next_sibling() instead of next_sibling (and likewise find_previous_sibling() instead of previous_sibling).
The reason: next_sibling does not only return the next HTML tag but the next "soup element"; usually that is the whitespace between tags, but it can be more. find_next_sibling(), on the other hand, returns the next HTML tag, ignoring whitespace and other crud between the tags.
I restructured your code a bit for this demonstration; I hope it is semantically the same.
Code with next_sibling, demonstrating the same behaviour that you described (works for data but not data2):
from bs4 import BeautifulSoup, Tag

data = "<p>method-removed-here</p><p>method-removed-here</p><p>method-removed-here</p>"
data2 = """<p>method-removed-here</p>
<p>method-removed-here</p>
<p>method-removed-here</p>
<p>method-removed-here</p>
<p>method-removed-here</p>
"""

soup = BeautifulSoup(data, 'html.parser')
string = 'method-removed-here'
for p in soup.find_all("p"):
    while True:
        ns = p.next_sibling
        if isinstance(ns, Tag) and ns.name == 'p' and p.text == string:
            ns.decompose()
        else:
            break
print(soup)
Code with find_next_sibling(), which works for both data and data2:
soup = BeautifulSoup(data, 'html.parser')
string = 'method-removed-here'
for p in soup.find_all("p"):
    while True:
        ns = p.find_next_sibling()
        if isinstance(ns, Tag) and ns.name == 'p' and p.text == string:
            ns.decompose()
        else:
            break
print(soup)
The same behaviour (returning all soup elements including unwanted whitespace) appears in other parts of BeautifulSoup: BeautifulSoup .children or .content without whitespace between tags.
Improving neurosnap's answer a bit by making it general:
def next_elem(element, func):
    new_elem = getattr(element, func)
    if new_elem == "\n":
        return next_elem(new_elem, func)
    else:
        return new_elem
Now you can call any function with it, for example:
next_elem(element, 'previous_sibling')
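For example, walking backwards over the data2 markup from the question while skipping the newline-only nodes (a sketch, assuming soup was built from data2):

last_p = soup.find_all("p")[-1]
prev_p = next_elem(last_p, 'previous_sibling')  # skips the "\n" node and lands on the previous <p>
print(prev_p)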