I am building a bot and want to extract the href part, which is /VegSpringRoll/status/1205121838302420993, from the following HTML on twitter.com:
<a class="css-4rbku5 css-18t94o4 css-901oao r-1re7ezh r-1loqt21 r-1q142lx r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-3s2u2q r-qvutc0" title="9:46 PM · Dec 12, 2019" href="/VegSpringRoll/status/1205121838302420993" dir="auto" aria-label="Dec 12" role="link" data-focusable="true"</a>
My script is:
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

class TwitterBot:
    def __init__(self, username, password):
        self.username = username
        self.password = password
        self.bot = webdriver.Firefox()

    def login(self):
        bot = self.bot
        bot.get('https://twitter.com/login')
        time.sleep(1)
        email = bot.find_element_by_class_name('js-username-field.email-input.js-initial-focus')
        password = bot.find_element_by_class_name('js-password-field')
        email.clear()
        password.clear()
        email.send_keys(self.username)
        password.send_keys(self.password)
        password.send_keys(Keys.RETURN)
        time.sleep(1)

    def like_tweet(self, hashtag):
        bot = self.bot
        bot.get('https://twitter.com/search?q=%23' + hashtag + '&src=type')
        time.sleep(1)
        for i in range(1, 10):
            bot.execute_script('window.scrollTo(0, document.body.scrollHeight)')  # this only scrolls once
            time.sleep(1)
        tweets = bot.find_elements_by_class_name('css-4rbku5 css-18t94o4 css-901oao r-1re7ezh r-1loqt21 r-1q142lx r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-3s2u2q r-qvutc0')
        links = [elem.get_attribute('href') for elem in tweets]
        print(links)
Everything works until the tweets part, but nothing gets printed. Would anybody please assist?
Selenium does not permit compound class names in find_elements_by_class_name; you have to use a CSS selector or XPath instead. The following code should work:
tweets = bot.find_elements_by_css_selector('.css-4rbku5.css-18t94o4.css-901oao.r-1re7ezh.r-1loqt21.r-1q142lx.r-1qd0xha.r-a023e6.r-16dba41.r-ad9z0x.r-bcqeeo.r-3s2u2q.r-qvutc0')
links = [elem.get_attribute('href') for elem in tweets]
print(links)
Please read this discussion to get more info.
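Since those generated class names tend to change between Twitter deployments, a less brittle option is to match the permalink anchors by their href pattern instead. A minimal sketch, assuming the status permalinks are the links you want (other anchors containing /status/ would also match):

# sketch: select anchors whose href contains /status/ instead of relying on generated class names
tweets = bot.find_elements_by_css_selector('a[href*="/status/"]')
links = [elem.get_attribute('href') for elem in tweets]
print(links)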
Related
I've seen some questions and posts on how to scrape the tweets of a specific handle, but not on how to get all the replies to a particular tweet using Python via a Jupyter Notebook.
Example: I want to scrape and export to Excel all the 340 replies to this public BBC tweet "Microplastics found in fresh Antarctic snow for the first time" (https://twitter.com/BBCWorld/status/1534777385249390593)
I need the following info: Reply date, Reply to (so I only get the replies to BBC, and not to other users in this thread) and the Reply text.
Inspecting the elements of the page, I see that the reply container's class is named css-1dbjc4n. Likewise:
The Reply date's class is: css-1dbjc4n r-1loqt21 r-18u37iz r-1ny4l3l r-1udh08x r-1qhn6m8 r-i023vh r-o7ynqc r-6416eg
The Reply to's class is: css-4rbku5 css-18t94o4 css-901oao r-14j79pv r-1loqt21 r-1q142lx r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-3s2u2q r-qvutc0
And the Reply text's class is: css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0
I have tried to run the code below, but the list remains empty :(
Results so far:
Empty DataFrame
Columns: [Date of Tweet, Replying to, Tweet]
Index: []
Can anyone help me, please?
Many thanks! :)
Code:
import sys
sys.path.append("path to site-packages in your pc")

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome(executable_path=r"C:chromedriver path in your pc")

dates = []    # List to store date of tweet
replies = []  # List to store reply-to info
comments = [] # List to store comments

driver.get("https://twitter.com/BBCWorld/status/1534777385249390593")
twts = []

content = driver.page_source
soup = BeautifulSoup(content)

for a in soup.findAll('div', href=True, attrs={'class': 'css-1dbjc4n'}):
    datetweet = a.find('div', attrs={'class': 'css-1dbjc4n r-1loqt21 r-18u37iz r-1ny4l3l r-1udh08x r-1qhn6m8 r-i023vh r-o7ynqc r-6416eg'})
    replytweet = a.find('div', attrs={'class': 'css-4rbku5 css-18t94o4 css-901oao r-14j79pv r-1loqt21 r-1q142lx r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-3s2u2q r-qvutc0'})
    commenttweet = a.find('div', attrs={'class': 'css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0'})
    dates.append(datetweet.text)
    replies.append(replytweet.text)
    comments.append(commenttweet.text)

df = pd.DataFrame({'Date of Tweet': dates, 'Replying to': replies, 'Tweet': comments})
df.to_csv('tweets.csv', index=False, encoding='utf-8')
print(df)
I found two problems:
The page uses JavaScript to add elements, and that JavaScript may need time to add all of them to the HTML, so you may need time.sleep(...) before you get driver.page_source. Or use explicit waits in Selenium to wait for specific elements before reading driver.page_source (a sketch follows below).
The HTML doesn't use <div href="...">, so your findAll('div', href=True, ...) is wrong. You have to remove href=True.
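For the first problem, a minimal sketch of the explicit-wait approach; it reuses the article[data-testid="tweet"] locator from the code further down, which you should treat as an assumption about Twitter's current markup:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 30 seconds for at least one tweet article before reading page_source
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH, '//article[@data-testid="tweet"]'))
)
content = driver.page_source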
EDIT:
Below is the code I created, but it also needs to scroll the page to load more tweets, and later it may need to click "Show more replies" to get even more (see the scrolling sketch after the code).
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
import pandas as pd
import time

#driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))

driver.get("https://twitter.com/BBCWorld/status/1534777385249390593")
time.sleep(10)

# TODO: scroll page to get more tweets
#for _ in range(2):
#    last = driver.find_elements(By.XPATH, '//div[@data-testid="cellInnerDiv"]')[-1]
#    driver.execute_script("arguments[0].scrollIntoView(true)", last)
#    time.sleep(3)

all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')

tweets = []

print(len(all_tweets) - 1)

for item in all_tweets[1:]:  # skip the first tweet because it is the BBC tweet
    #print('--- item ---')
    #print(item.text)

    print('--- date ---')
    try:
        date = item.find_element(By.XPATH, './/time').text
    except:
        date = '[empty]'
    print(date)

    print('--- text ---')
    try:
        text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
    except:
        text = '[empty]'
    print(text)

    print('--- replying_to ---')
    try:
        replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
    except:
        replying_to = '[empty]'
    print(replying_to)

    tweets.append([date, replying_to, text])

df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv('tweets.csv', index=False, encoding='utf-8')
print(df)
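A sketch of the scrolling step from the TODO above (it would run right after the initial time.sleep(10), before all_tweets is collected): keep scrolling the last loaded cell into view until the number of cells stops growing. This assumes div[data-testid="cellInnerDiv"] keeps wrapping each loaded tweet; clicking "Show more replies" would still need to be handled separately.

# scroll until no new tweet cells are loaded
prev_count = 0
while True:
    cells = driver.find_elements(By.XPATH, '//div[@data-testid="cellInnerDiv"]')
    if len(cells) == prev_count:
        break
    prev_count = len(cells)
    driver.execute_script("arguments[0].scrollIntoView(true)", cells[-1])
    time.sleep(3)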
I am trying to get the name and the address of each restaurant from this website: https://www.kravekar.com/restaurants
The issue is that each time I return to the main page, I get this error:
Element <div class="restaurant-card">...</div> is not clickable at point (1129, 435). Other element would receive the click: <i class="fa fa-spinner fa-pulse"></i>
I tried to implement a driver refresh and a time.sleep, but it is not working; I get the same error on the third iteration.
So far this is my reproducible code:
driver.get('https://www.kravekar.com/restaurants')

comment_button = driver.find_elements(by=By.CSS_SELECTOR, value="div.restaurant-card")

result = []
for btn in comment_button:
    btn.click()
    try:
        name = driver.find_element(by=By.XPATH, value='//*[@id="restaurant_menu_head"]/div/div/div[2]/div[1]/div/div/h4')
        name = name.text
        print(name)
        address = driver.find_element(by=By.XPATH, value='//*[@id="restaurant_menu_head"]/div/div/div[2]/div[1]/div/div/div/span')
        address = address.text
        print(address)
    except:
        print("No address or name")
    driver.execute_script("window.history.go(-1)")
When you do btn.click() or driver.execute_script("window.history.go(-1)"), the element references collected on the home page can become stale. So it is better to store the URL of every restaurant right from the home page and then loop over the stored URLs.
driver.get('https://www.kravekar.com/restaurants')

cards = driver.find_elements(By.CSS_SELECTOR, ".restaurant-card-wrapper")
urls = [card.get_attribute('href') for card in cards]
names = [card.find_element(By.CSS_SELECTOR, ".restaurant-card-title").text for card in cards]

for idx, url in enumerate(urls):
    try:
        driver.get(url)
        # name = driver.find_element(By.CSS_SELECTOR, "#tab_menu_info h4.media-heading").text
        print(names[idx])
        address = driver.find_element(By.CSS_SELECTOR, "#tab_menu_info .restaurant_menu_info-addresss").text
        print(address)
    except:
        print("No address or name")
which outputs
Arby's
51171 Highway 6 & 24, Glenwood Springs, CO 81601
Springs Bar and Grill
722 Grand Ave, Glenwood Springs, CO 81601
etc.
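If you prefer to keep the click-and-go-back flow instead, a rough alternative (a sketch, not verified against this site) is to wait for the fa-spinner mentioned in the error message to disappear and to re-locate the cards on every iteration rather than reusing the stale references:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get('https://www.kravekar.com/restaurants')
count = len(driver.find_elements(By.CSS_SELECTOR, "div.restaurant-card"))
for i in range(count):
    # wait until the loading spinner is gone, then re-query the cards so the reference is fresh
    WebDriverWait(driver, 10).until(
        EC.invisibility_of_element_located((By.CSS_SELECTOR, "i.fa-spinner"))
    )
    driver.find_elements(By.CSS_SELECTOR, "div.restaurant-card")[i].click()
    # ... scrape name and address as above ...
    driver.execute_script("window.history.go(-1)")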
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

url = 'http://www.mtv.de/charts/c6mc86/single-top-100?expanded=true'

chromedriver = Service("/usr/local/bin/chromedriver")
op = webdriver.ChromeOptions()
browser = webdriver.Chrome(service=chromedriver, options=op)

browser.get(url)
timeout = 60
browser.implicitly_wait(20)
browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
time.sleep(5)

try:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '/html/body/div[1]/main/div/section/div/div/div/object')))
    print('========================')
except TimeoutException:
    browser.quit()

items = browser.switch_to.frame(browser.find_element(By.TAG_NAME, 'object'))
print(items)

itembox = items.find_elements(By.CLASS_NAME, 'charts-marslnet')
# print(itembox)

for item in itembox:
    print(item.text)
I have been trying to scrape the song name, artist and URL for each song from this website, but I am unable to access the HTML inside the tag under the #document section, and I cannot figure out why. Any insights on what the issue with my code might be, or on what should be done to access the HTML inside the #document section, would be very helpful.
(Screenshot: HTML inside the tag's #document section)
You can grab it from the direct url:
import requests
from bs4 import BeautifulSoup

url = 'https://mtv.marsl.net/demo/showdbcharts.php?c=4'
response = requests.get(url)

soup = BeautifulSoup(response.text, 'html.parser')
acts = soup.find_all('div', {'class': 'cmn-act'})
for each in acts:
    title = each.find_next('div', {'class': 'cmn-title'}).text.strip()
    artist = each.find_next('div', {'class': 'cmn-artist'}).text.strip()
    link = each.find_next('a', href=True)['href']
    print(f'{title}\n{artist}\n{link}\n\n')
Output:
abcdefu
Gayle
https://www.mtv.de/musikvideos/r9d9sl/abcdefu
Wenn ich will
Gzuz & Bonez MC
https://www.mtv.de/musikvideos/7evkst/10von10
10von10
Pajel
https://www.mtv.de/musikvideos/7evkst/10von10
Shivers
Ed Sheeran
https://www.mtv.de/musikvideos/miq9lq/shivers
Heat Waves
Glass Animals
https://www.mtv.de/musikvideos/l9rv5d/heat-waves
...
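If you would rather discover that direct URL programmatically than hard-code it, a small sketch (assuming the chart is embedded via the <object> element the original script already locates) is to read the element's data attribute with Selenium and then fetch it with requests:

# the <object data="..."> attribute points at the embedded chart page
obj = browser.find_element(By.TAG_NAME, 'object')
direct_url = obj.get_attribute('data')
response = requests.get(direct_url)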
I am trying to scrape user beer reviews from beeradvocate.com to analyze user attitudes towards different beer types, but I only get results for the first few pages; the rest are empty.
Situation:
There are 500 different types of beer, and each beer has a different number of ratings and reviews.
The site only shows 1 page of results for guests; to see all the information, you need to log in.
My approach
Get the beer link and the number of ratings for each beer to define the loop range per beer.
Log in using a requests session and POST.
import time

import requests
import lxml.html

def review_scrape(beer_link, number_of_ratings):
    reviews = []
    rate = []
    for pages_i in range(0, int(number_of_ratings), 25):  # site shows 25 results/page
        session = requests.session()  # Start the session
        payload = {'login': 'suzie102', 'password': ''}
        page1 = session.post("https://www.beeradvocate.com/community/login/login", data=payload)
        url = beer_link + '/?view=beer&sort=&start=%d' % (pages_i)
        page1 = session.get(url)
        time.sleep(3)
        soup1 = lxml.html.fromstring(page1.text)
        rate_i = soup1.xpath('//span[@class = "muted"]/text()')[8::3]
        print(url)
        reviews_i = soup1.xpath('//div/text()')
        reviews.append(reviews_i)
        print(len(reviews))
        rate.append(rate_i)
    return rate, reviews
There is only one problem that I've seen.
url = beer_link+'/?view=beer&sort=&start=%d'%(pages_i)
The / is redundant; what you need is
url = beer_link+'?view=beer&sort=&start=%d'%(pages_i)
That is why there is //?view in the URLs you print.
I can see that there are "next" anchor links leading to the next page. I would recommend a while loop or recursion (a sketch follows below).
Other than that, I can't see what is missing from your script. Everything else looks in order and should work.
If you could give us more details, we might have more to work with.
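For the "next" link idea, a minimal while-loop sketch; the link text and URL handling are assumptions, not verified against the site:

url = beer_link + '?view=beer&sort=&start=0'
while url:
    page = session.get(url)
    soup1 = lxml.html.fromstring(page.text)
    # ... collect rate_i / reviews_i from soup1 as in the original loop ...
    nxt = soup1.xpath('//a[text()="next"]/@href')  # assumed pagination link text
    url = 'https://www.beeradvocate.com' + nxt[0] if nxt else None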
Update: thanks to everyone's comments, I tried doing the scraping with Selenium instead. It works now.
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm

def webstite_scrape_p2(beer_link, number_of_ratings):
    driver = webdriver.Chrome('/home/sam/Downloads/chromedriver')
    url = 'https://www.beeradvocate.com/community/login/'
    driver.get(url)

    loginelement = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//form[@class="xenForm formOverlay"]//dd//input[@name="login"]')))
    loginelement.send_keys('suzie102')

    pwelement = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//form[@class="xenForm formOverlay"]//dl[@class="ctrlUnit"]//dd//ul//li[@id="ctrl_pageLogin_registered_Disabler"]//input[@name="password"]')))
    pwelement.send_keys('')

    page_click = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//form[@class="xenForm formOverlay"]//dl[@class="ctrlUnit submitUnit"]//dd//input[@type="submit"]')))
    page_click.click()

    rate = []
    reviews = []
    avg_user = []
    for link, i in zip(beer_link, number_of_ratings):
        for pages_i in tqdm(range(0, int(i), 25)):  # site shows 25 results/page
            new_url = link + '?view=beer&sort=&start=%d' % (pages_i)
            print(new_url)
            driver.get(new_url)
            #print(driver.find_element_by_name("hideRatings").is_selected())
            #check_box = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//form[@style="display:inline;margin:0;padding:0;"]//input[@type="checkbox"]')))
            #check_box.click()
            time.sleep(5)
            driver.get(new_url)
            page_source = driver.page_source

            soup = BeautifulSoup(page_source, 'html.parser')
            rate_i = [i.get_text() for i in soup.find_all('span', class_="muted")][8::3]
            rate.append(rate_i)
            reviews_i = [i.get_text() for i in soup.find_all('div')]
            reviews.append(reviews_i)
            avg_i = [i.get_text() for i in soup.find_all('span', class_="BAscore_norm")]
            avg_user.append(avg_i)
    return rate, reviews, avg_user
Can I get the data in a <b> tag under an <a> tag with Selenium and Python?
If I can, how? Could you tell me the solution?
This is the structure of the HTML:
...
<div class = "cont_inner">
<div class = "wrap_tit_ mg_tit">
<a href = "href="https://cp.news.search.daum.net/p/97048679" class"f_link_b" onclick="smartLog(this, "dc=NNS&d=26DQnlvsWTMHk5CtBf&pg=6&r=2&p=4&rc=10&e1=163cv75CcAF31EvlGD&e3=0&ext=dsid=26DQnlvsWTMHk5CtBf", event, {"cpid": {"value": "163cv75CcAF31EvlGD"}});" target = "_blank"> == $0
"하남지역자활센터,"
<b>보건복지부</b>
"간이평가 우수기관"
</a>
</div>
and I want to get the data like this:
"하남지역자활센터, 보건복지부 간이평가우수기관"
This is what my code currently outputs:
[['"하남지역자활센터, , 간이평가 우수기관"']]
And this is my source code for crawling the data from the website:
import requests
import lxml.html
from selenium import webdriver

class crwaler_daum:
    def __init__(self):
        self.title = []
        self.body = []
        self.url = input("please enter url for crawling data")
        self.page = input('please enter number of page to get data')

    def get_title(self):
        return self.title

    def set_title(self, title):
        self.title.append(title)

    def get_body(self):
        return self.body

    def set_body(self, body):
        self.body.append(body)

    def crwaling_title(self):
        title_list = []
        chrome_driver = webdriver.Chrome('D:/바탕 화면/인턴/python/crwaler/news_crawling/chromedriver.exe')
        url = self.url
        response = requests.get(url, verify=False)
        root = lxml.html.fromstring(response.content)
        chrome_driver.get(url)
        for i in range(int(self.page) + 1):
            for j in root.xpath('//*[@id="clusterResultUL"]/li'):
                title_list.append(j.xpath('div[2]/div/div[1]/a/text()'))
            print(title_list)
            chrome_driver.get('https://search.daum.net/search?w=news&DA=PGD&enc=utf8&cluster=y&cluster_page=3&q=%EB%B3%B4%EA%B1%B4%EB%B3%B5%EC%A7%80%EB%B6%80&p={}'.format(i))
lxml has a built-in method .text_content(), which "returns the text content of the element, including the text content of its children, with no markup". After using this function, you still need to manipulate the string to get it into the shape you want. I hope the code below makes this clearer; it may not be the most practical solution, since I'm a beginner in Python too, but it solves the problem for now.
import lxml.html
html = '''
<div class = "cont_inner">
<div class = "wrap_tit_ mg_tit">
<a href = "href="https://cp.news.search.daum.net/p/97048679" class"f_link_b" onclick="smartLog(this, "dc=NNS&d=26DQnlvsWTMHk5CtBf&pg=6&r=2&p=4&rc=10&e1=163cv75CcAF31EvlGD&e3=0&ext=dsid=26DQnlvsWTMHk5CtBf", event, {"cpid": {"value": "163cv75CcAF31EvlGD"}});" target = "_blank">
"하남지역자활센터,"
<b>보건복지부</b>
"간이평가 우수기관"
</a>
</div>'''
my_html = lxml.html.fromstring(html)
a_element = my_html.xpath('//div[#class="wrap_tit_ mg_tit"]/a')
print(a_element[0].text_content())
def prettify_string(string):
    string = string.replace("\n", "").replace("\"", "").split(" ")
    while "" in string:
        string.remove("")
    string = " ".join(string)
    return string
"""
Prints:
"하남지역자활센터,"
보건복지부
"간이평가 우수기관"
"""
print(prettify_string(str(a_element[0].text_content())))
"""
Prints:
하남지역자활센터, 보건복지부 간이평가 우수기관
"""
I haven't used the lxml crawler yet, but you can use BeautifulSoup instead.
from bs4 import BeautifulSoup
from selenium import webdriver

chrome_driver = webdriver.Chrome('your chromedriver path')
chrome_driver.get('your url')
html = chrome_driver.page_source
soup = BeautifulSoup(html, 'html.parser')
b_tag = soup.find_all('b')
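To get the combined title text rather than just the <b> elements, one option (a sketch, assuming the anchor keeps the f_link_b class from the question's HTML) is to select the <a> element and call get_text(), which includes the text of its child tags:

# full anchor text, <b> content included, with whitespace collapsed;
# strip stray quote characters if needed, as in the lxml answer above
for a in soup.find_all('a', class_='f_link_b'):
    print(' '.join(a.get_text().split()))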