Script produces wrong results when linebreak comes into play - python

I've written a script in python to scrape some disorganized content located within b tags and their next_sibling from a webpage. The thing is my script fails when linebreaks come between. I'm trying to extract the titles and their corresponding descriptions from that page, starting from CHIEF COMPLAINT: Bright red blood per rectum to just before Keywords:.
Website address
I've tried so far with:
import requests
from bs4 import BeautifulSoup

url = 'https://www.mtsamples.com/site/pages/sample.asp?Type=24-Gastroenterology&Sample=941-BloodperRectum'
res = requests.get(url)
soup = BeautifulSoup(res.text,'lxml')

# Each section header is a <b> tag after the first <hr>; the section's
# text is expected to be the node immediately following the header.
for header in soup.select_one("hr").find_next_siblings('b'):
    print(header.text, header.next_sibling)
The portion of output giving me unwanted results are like:
LABS: <br/>
CBC: <br/>
CHEM 7: <br/>
How can I get the titles and their corresponding descriptions?

Here's a scraper that's more robust compared to yesterday's solutions.
How to loop through scraping multiple documents on multiple web pages using BeautifulSoup?
How can I grab the entire body text from a web page using BeautifulSoup?
It extracts, title, description and all sections properly
import re
import copy
import requests
from bs4 import BeautifulSoup, Tag, Comment, NavigableString
from urllib.parse import urljoin
from pprint import pprint
import itertools
import concurrent
from concurrent.futures import ThreadPoolExecutor
BASE_URL = 'https://www.mtsamples.com'
def make_soup(url: str) -> BeautifulSoup:
    """Download *url* and return its parsed DOM, raising on HTTP errors."""
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')
def clean_soup(soup: BeautifulSoup) -> BeautifulSoup:
    """Return a copy of the sample page trimmed to the report content.

    Drops everything before the <h1> and everything from the "Keywords:"
    label onwards, plus ad containers, <script> elements and HTML comments
    inside the report container. Returns the <h1>'s parent element.
    """
    soup = copy.copy(soup)
    h1 = soup.select_one('h1')
    kw_re = re.compile('.*Keywords.*', flags=re.IGNORECASE)
    kw = soup.find('b', text=kw_re)
    # Remove everything outside the h1 .. Keywords span.
    for el in (*h1.previous_siblings, *kw.next_siblings):
        el.extract()
    kw.extract()
    for ad in soup.select('[id*="ad"]'):
        ad.extract()
    # BUG FIX: `soup.script` is the *first* <script> tag (or None), so
    # iterating it walked that tag's children instead of removing the
    # scripts. Remove every <script> element in the document.
    for script in soup.find_all('script'):
        script.extract()
    for c in h1.parent.children:
        if isinstance(c, Comment):
            c.extract()
    return h1.parent
def extract_meta(soup: BeautifulSoup) -> dict:
    """Pull the page title and free-text description out of the report."""
    title = soup.select_one('h1').text.strip()

    # The description is every node between the "Description:" label and
    # the first <hr>.
    desc_label = soup.find('b', text=re.compile('.*Description.*', flags=re.IGNORECASE))
    hr = soup.select_one('hr')
    desc_parts = []
    for node in desc_label.next_siblings:
        if node is hr:
            break
        if isinstance(node, NavigableString):
            desc_parts.append(str(node).strip())
        elif isinstance(node, Tag):
            desc_parts.append(node.text.strip())

    description = '\n'.join(part.strip() for part in desc_parts if part.strip())
    return {
        'title': title,
        'description': description
    }
def extract_sections(soup: BeautifulSoup) -> list:
    """Split the report into sections keyed by their ALL-CAPS <b> headers.

    Returns a list of {'title': str, 'text': str} dicts, one per header,
    where the text is everything between this header and the next one.
    """
    titles = [b for b in soup.select('b') if b.text.isupper()]
    parts = []
    for t in titles:
        title = t.text.strip(': ').title()
        text_parts = []
        for s in t.next_siblings:
            # Walk forward until we see another title. BUG FIX: compare by
            # identity — `s in titles` uses bs4 Tag equality, which compares
            # markup, so two sections with identical headers would end the
            # walk too early.
            if any(s is other for other in titles):
                break
            if isinstance(s, Comment):
                continue
            if isinstance(s, NavigableString):
                text_parts.append(str(s).strip())
            if isinstance(s, Tag):
                text_parts.append(s.text.strip())
        text = '\n'.join(p for p in text_parts if p.strip())
        parts.append({
            'title': title,
            'text': text
        })
    return parts
def extract_page(url: str) -> dict:
    """Scrape one sample page into {'title', 'description', 'sections'}."""
    cleaned = clean_soup(make_soup(url))
    result = dict(extract_meta(cleaned))
    result['sections'] = extract_sections(cleaned)
    return result
# Scrape one sample report and pretty-print the structured result.
url = 'https://www.mtsamples.com/site/pages/sample.asp?Type=24-Gastroenterology&Sample=941-BloodperRectum'
pprint(extract_page(url), width=2000)
output:
{'description': 'Status post colonoscopy. After discharge, experienced bloody bowel movements and returned to the emergency department for evaluation.\n(Medical Transcription Sample Report)',
'sections': [{'text': 'Bright red blood per rectum', 'title': 'Chief Complaint'},
# some elements removed for brevity
{'text': '', 'title': 'Labs'},
{'text': 'WBC count: 6,500 per mL\nHemoglobin: 10.3 g/dL\nHematocrit:31.8%\nPlatelet count: 248 per mL\nMean corpuscular volume: 86.5 fL\nRDW: 18%', 'title': 'Cbc'},
{'text': 'Sodium: 131 mmol/L\nPotassium: 3.5 mmol/L\nChloride: 98 mmol/L\nBicarbonate: 23 mmol/L\nBUN: 11 mg/dL\nCreatinine: 1.1 mg/dL\nGlucose: 105 mg/dL', 'title': 'Chem 7'},
{'text': 'PT 15.7 sec\nINR 1.6\nPTT 29.5 sec', 'title': 'Coagulation Studies'},
{'text': 'The patient receive ... ula.', 'title': 'Hospital Course'}],
'title': 'Sample Type / Medical Specialty: Gastroenterology\nSample Name: Blood per Rectum'}

Code:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# BUG FIX: the pasted URL contained a stray space ("& Sample="), which
# corrupts the query-string parameter name; this is the working URL.
url = 'https://www.mtsamples.com/site/pages/sample.asp?Type=24-Gastroenterology&Sample=941-BloodperRectum'
res = urlopen(url)
html = res.read()
soup = BeautifulSoup(html, 'html.parser')

# The report content lives in <div id="sampletext"> (found with the
# browser's Inspect Element).
sampletext_div = soup.find('div', {'id': "sampletext"})
print(sampletext_div.find('h1').text)  # print the header
Output:
Sample Type / Medical Specialty: Gastroenterology
Sample Name: Blood per Rectum
Code:
# Collect every <b> tag in the report; the first four are boilerplate,
# the rest are section headers whose text follows as the next sibling.
b_all = sampletext_div.findAll('b')
for header in b_all[4:]:
    print(header.text, header.next_sibling)
Output:
CHIEF COMPLAINT: Bright red blood per rectum
HISTORY OF PRESENT ILLNESS: This 73-year-old woman had a recent medical history significant for renal and bladder cancer, deep venous thrombosis of the right lower extremity, and anticoagulation therapy complicated by lower gastrointestinal bleeding. Colonoscopy during that admission showed internal hemorrhoids and diverticulosis, but a bleeding site was not identified. Five days after discharge to a nursing home, she again experienced bloody bowel movements and returned to the emergency department for evaluation.
REVIEW OF SYMPTOMS: No chest pain, palpitations, abdominal pain or cramping, nausea, vomiting, or lightheadedness. Positive for generalized weakness and diarrhea the day of admission.
PRIOR MEDICAL HISTORY: Long-standing hypertension, intermittent atrial fibrillation, and hypercholesterolemia. Renal cell carcinoma and transitional cell bladder cancer status post left nephrectomy, radical cystectomy, and ileal loop diversion 6 weeks prior to presentation, postoperative course complicated by pneumonia, urinary tract infection, and retroperitoneal bleed. Deep venous thrombosis 2 weeks prior to presentation, management complicated by lower gastrointestinal bleeding, status post inferior vena cava filter placement.
MEDICATIONS: Diltiazem 30 mg tid, pantoprazole 40 mg qd, epoetin alfa 40,000 units weekly, iron 325 mg bid, cholestyramine. Warfarin discontinued approximately 10 days earlier.
ALLERGIES: Celecoxib (rash).
SOCIAL HISTORY: Resided at nursing home. Denied alcohol, tobacco, and drug use.
FAMILY HISTORY: Non-contributory.
PHYSICAL EXAM: <br/>
LABS: <br/>
CBC: <br/>
CHEM 7: <br/>
COAGULATION STUDIES: <br/>
HOSPITAL COURSE: The patient received 1 liter normal saline and diltiazem (a total of 20 mg intravenously and 30 mg orally) in the emergency department. Emergency department personnel made several attempts to place a nasogastric tube for gastric lavage, but were unsuccessful. During her evaluation, the patient was noted to desaturate to 80% on room air, with an increase in her respiratory rate to 34 breaths per minute. She was administered 50% oxygen by nonrebreadier mask, with improvement in her oxygen saturation to 89%. Computed tomographic angiography was negative for pulmonary embolism.
Keywords:
gastroenterology, blood per rectum, bright red, bladder cancer, deep venous thrombosis, colonoscopy, gastrointestinal bleeding, diverticulosis, hospital course, lower gastrointestinal bleeding, nasogastric tube, oxygen saturation, emergency department, rectum, thrombosis, emergency, department, gastrointestinal, blood, bleeding, oxygen,
NOTE : These transcribed medical transcription sample reports and examples are provided by various users and
are for reference purpose only. MTHelpLine does not certify accuracy and quality of sample reports.
These transcribed medical transcription sample reports may include some uncommon or unusual formats;
this would be due to the preference of the dictating physician. All names and dates have been
changed (or removed) to keep confidentiality. Any resemblance of any type of name or date or
place or anything else to real world is purely incidental.

Related

how to extract links from a website and extract its content in web scraping using python

I am trying to extract data from a website using beautifulSoup and requests packages
where I want to extract the links and it contents .
Until now I have been able to extract the list of links that exist on a defined url, but I do not know how to open each link and extract the text.
the image below describe my problem :
The text and the image are the link to the whole article.
code:
import requests
from bs4 import BeautifulSoup

url = "https://www.annahar.com/english/section/186-mena"
html_text = requests.get(url)
soup = BeautifulSoup(html_text.content, features = "lxml")
print(soup.prettify())

# Scraping html tags such as Title, Links, Publication date.
# BUG FIX: `news` was never defined (NameError as posted); select the
# article cards from the listing container first.
news = soup.select('div#listingDiv44083 div.article')
for index, new in enumerate(news):
    published_date = new.find('span', class_="article__time-stamp").text
    title = new.find('h3', class_="article__title").text
    link = new.find('a', class_="article__link").attrs['href']
    print(f" publish_date: {published_date}")
    print(f" title: {title}")
    print(f" link: {link}")
result :
publish_date:
06-10-2020 | 20:53
title:
18 killed in bombing in Turkish-controlled Syrian town
link: https://www.annahar.com/english/section/186-mena/06102020061027020
My question is how to continue from here in order to enter each link and extract its content?
the expected result :
publish_date:
06-10-2020 | 20:53
title:
18 killed in bombing in Turkish-controlled Syrian town
link: https://www.annahar.com/english/section/186-mena/06102020061027020
description:
ANKARA: An explosives-laden truck ignited Tuesday on a busy street in a northern #Syrian town controlled by #Turkey-backed opposition fighters, killing at least 18 people and wounding dozens, Syrian opposition activists reported.
The blast in the town of al-Bab took place near a bus station where people often gather to travel from one region to another, according to the opposition’s Civil Defense, also known as White Helmets.
where the description exist inside the link
Add an additional request to your loop that gets to the article page and there grab the description
# Fragment meant to go inside the listing loop: follow the article link
# and fetch the article page itself.
page = requests.get(link)
soup = BeautifulSoup(page.content, features = "lxml")
# The article body lives in div.articleMainText on the article page.
description = soup.select_one('div.articleMainText').get_text()
print(f" description: {description}")
Example
import requests
from bs4 import BeautifulSoup

url = "https://www.annahar.com/english/section/186-mena"
html_text = requests.get(url)
soup = BeautifulSoup(html_text.content, features = "lxml")
# print(soup.prettify())

# For each listing card: grab date, title and link, then follow the link
# and pull the article's description from its own page.
for card in soup.select('div#listingDiv44083 div.article'):
    published_date = card.find('span', class_="article__time-stamp").get_text(strip=True)
    title = card.find('h3', class_="article__title").get_text(strip=True)
    link = card.find('a', class_="article__link").attrs['href']

    article_page = requests.get(link)
    article_soup = BeautifulSoup(article_page.content, features = "lxml")
    description = article_soup.select_one('div.articleMainText').get_text()

    print(f" publish_date: {published_date}")
    print(f" title: {title}")
    print(f" link: {link}")
    print(f" description: {description}", '\n')
You have to grab all follow links to the articles and then loop over that and grab the parts you're interested in.
Here's how:
import time
import requests
from bs4 import BeautifulSoup

listing = BeautifulSoup(
    requests.get("https://www.annahar.com/english/section/186-mena").content,
    "lxml"
)

# Collect the article links, skipping in-page anchor fragments.
follow_links = []
for anchor in listing.find_all("a", class_="article__link"):
    href = anchor["href"]
    if "#" not in href:
        follow_links.append(href)

# Visit each article and print its date, title and body.
for link in follow_links:
    article = BeautifulSoup(requests.get(link).content, "lxml")
    date_published = article.find("span", class_="date").getText(strip=True)
    title = article.find("h1", class_="article-main__title").getText(strip=True)
    article_body = article.find("div", {"id": "bodyToAddTags"}).getText()
    print(f"{date_published} {title}\n\n{article_body}\n", "-" * 80)
    time.sleep(2)  # be polite to the server
Output (shortened for brevity):
08-10-2020 | 12:35 Iran frees rights activist after more than 8 years in prison
TEHRAN: Iran has released a prominent human rights activist who campaigned against the death penalty, Iranian media reported Thursday.The semiofficial ISNA news agency quoted judiciary official Sadegh Niaraki as saying that Narges Mohammadi was freed late Wednesday after serving 8 1/2 years in prison. She was sentenced to 10 years in 2016 while already incarcerated.Niaraki said Mohammadi was released based on a law that allows a prison sentence to be commutated if the related court agrees.In July, rights group Amnesty International demanded Mohammadi’s immediate release because of serious pre-existing health conditions and showing suspected COVID-19 symptoms. The Thursday report did not refer to her possible illness.Mohammadi was sentenced in Tehran’s Revolutionary Court on charges including planning crimes to harm the security of Iran, spreading propaganda against the government and forming and managing an illegal group.She was in a prison in the northwestern city of Zanjan, some 280 kilometers (174 miles) northwest of the capital Tehran.Mohammadi was close to Iranian Nobel Peace Prize laureate Shirin Ebadi, who founded the banned Defenders of Human Rights Center. Ebadi left Iran after the disputed re-election of then-President Mahmoud Ahmadinejad in 2009, which touched off unprecedented protests and harsh crackdowns by authorities.In 2018, Mohammadi, an engineer and physicist, was awarded the 2018 Andrei Sakharov Prize, which recognizes outstanding leadership or achievements of scientists in upholding human rights.
--------------------------------------------------------------------------------
...

Beautiful soup: Extract text and urls from a list, but only under specific headings

I am looking to use Beautiful Soup to scrape the Fujitsu news update page: https://www.fujitsu.com/uk/news/pr/2020/
I only want to extract the information under the headings of the current month and previous month.
For a particular month (e.g. November), I am trying to extract into a list
the Title
the URL
the text
for each news briefing (so a list of lists).
My attempt so far is as follow (showing only previous month for simplicity):
# BUG FIX: as posted, the snippet used datetime, calendar, requests and
# BeautifulSoup without importing them (NameError on first use).
import calendar
import datetime

import requests
from bs4 import BeautifulSoup

# Resolve the current and previous month names, e.g. "December"/"November".
today = datetime.datetime.today()
year_str = str(today.year)
current_m = today.month
previous_m = current_m - 1  # NOTE(review): 0 in January; calendar.month_name[0] is ''
current_m_str = calendar.month_name[current_m]
previous_m_str = calendar.month_name[previous_m]

URL = 'https://www.fujitsu.com/uk/news/pr/' + year_str + '/'
resp = requests.get(URL)
soup = BeautifulSoup(resp.text, 'lxml')

# Print everything between the previous month's <h3> and the next <h3>.
previous_m_body = soup.find('h3', text=previous_m_str)
if previous_m_body is not None:
    for sib in previous_m_body.find_next_siblings():
        if sib.name == "h3":
            break
        else:
            previous_m_text = str(sib.text)
            print(previous_m_text)
However, this generates one long string with newlines, and no separation between Title, text, url:
Fujitsu signs major contract with Scottish Government to deliver election e-Counting solution London, United Kingdom, November 30, 2020 - Fujitsu, a leading digital transformation company, has today announced a major contract with the Scottish Government and Scottish Local...
Fujitsu Introduces Ultra-Compact, 50A PCB Relay for Medium-to-Heavy Automotive Loads Hoofddorp, EMEA, November 11, 2020 - Fujitsu Components Europe has expanded its automotive relay offering with a new 12VDC PCB relay featuring.......
I have attached an image of the page DOM.
Try this:
import requests
from bs4 import BeautifulSoup

html = requests.get("https://www.fujitsu.com/uk/news/pr/2020/").text
all_lists = BeautifulSoup(html, "html.parser").find_all("ul", class_="filterlist")

# Each briefing becomes [title, absolute URL, trailing text after the title].
news = []
for unordered_list in all_lists:
    for list_item in unordered_list.find_all("li"):
        anchor = list_item.find("a")
        title = anchor.getText()
        news.append([
            title,
            f"https://www.fujitsu.com{anchor['href']}",
            list_item.getText(strip=True)[len(title):],
        ])

for news_item in news:
    print("\n".join(news_item))
    print("-" * 80)
Output (shortened for brevity):
Fujitsu signs major contract with Scottish Government to deliver election e-Counting solution
https://www.fujitsu.com/uk/news/pr/2020/fs-20201130.html
London, United Kingdom, November 30, 2020- Fujitsu, a leading digital transformation company, has today announced a major contract with the Scottish Government and Scottish Local Authorities to support the electronic counting (e-Counting) of ballot papers at the Scottish Local Government elections in May 2022.Fujitsu Introduces Ultra-Compact, 50A PCB Relay for Medium-to-Heavy Automotive LoadsHoofddorp, EMEA, November 11, 2020- Fujitsu Components Europe has expanded its automotive relay offering with a new 12VDC PCB relay featuring a switching capacity of 50A at 14VDC. The FBR53-HC offers a higher contact rating than its 40A FBR53-HW counterpart, yet occupies the same 12.1 x 15.5 x 13.7mm footprint and weighs the same 6g.
--------------------------------------------------------------------------------
and more ...
EDIT:
To get just the last two months, all you need is the first two ul items from the soup. So, add [:2] to the first for loop, like this:
for unordered_list in all_lists[:2]:
# the rest of the loop body goes here
Here I modified your code. I combined your bs4 code with Selenium. Selenium is very powerful for scraping dynamic or JavaScript-based websites, and you can use it together with BeautifulSoup to make your life easier. Now it will give you output for all months.
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Firefox()
driver.maximize_window()

url = "https://www.fujitsu.com/uk/news/pr/2020/" #change the url if you want to get result for different year
driver.get(url)

# Parse the rendered page; this covers the current month back through
# every previous month.
soup = BeautifulSoup(driver.page_source, "html.parser")

# Print every month heading (January to November).
for heading in soup.find_all('h3'):
    print(f"month_name : {heading.text}\n")

# Print the briefing text of every month's list, newlines collapsed.
for filter_list in soup.find_all('ul', class_='filterlist'):
    flattened = filter_list.text.replace('\n', '')
    print(f"description_text: {flattened}")
output:

Scraping and parsing citation info from Google Scholar search results

I have a list of around 20000 article's titles and i want to scrape their citation count from google scholar. I am new to BeautifulSoup library. I have this code:
import requests
from bs4 import BeautifulSoup

# BUG FIX: the original concatenated the *list* of titles directly into the
# URL string ('str' + list raises TypeError). Search one title at a time.
queries = ['Role for migratory wild birds in the global spread of avian influenza H5N8',
           'Uncoupling conformational states from activity in an allosteric enzyme',
           'Technological Analysis of the World’s Earliest Shamanic Costume: A Multi-Scalar, Experimental Study of a Red Deer Headdress from the Early Holocene Site of Star Carr, North Yorkshire, UK',
           'Oxidative potential of PM 2.5 during Atlanta rush hour: Measurements of in-vehicle dithiothreitol (DTT) activity',
           'Primary Prevention of CVD',
           'Growth and Deposition of Au Nanoclusters on Polymer-wrapped Graphene and Their Oxygen Reduction Activity',
           'Relations of Preschoolers Visual-Motor and Object Manipulation Skills With Executive Function and Social Behavior',
           'We Know Who Likes Us, but Not Who Competes Against Us']

results = []
for query in queries:
    url = 'https://scholar.google.com/scholar?q=' + query + '&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search'
    content = requests.get(url).text
    page = BeautifulSoup(content, 'lxml')
    for entry in page.find_all("h3", attrs={"class": "gs_rt"}):
        results.append({"title": entry.a.text, "url": entry.a['href']})
but it returns only title and url. i don't know how to get the citation information from another tag. Please help me out here.
You need to loop the list. You can use Session for efficiency. The below is for bs 4.7.1 which supports :contains pseudo class for finding the citation count. Looks like you can remove the h3 type selector from the css selector and just use class before the a i.e. .gs_rt a. If you don't have 4.7.1. you can use [title=Cite] + a to select citation count instead.
import requests
from bs4 import BeautifulSoup as bs

queries = ['Role for migratory wild birds in the global spread of avian influenza H5N8',
           'Uncoupling conformational states from activity in an allosteric enzyme',
           'Technological Analysis of the World’s Earliest Shamanic Costume: A Multi-Scalar, Experimental Study of a Red Deer Headdress from the Early Holocene Site of Star Carr, North Yorkshire, UK',
           'Oxidative potential of PM 2.5 during Atlanta rush hour: Measurements of in-vehicle dithiothreitol (DTT) activity',
           'Primary Prevention of CVD',
           'Growth and Deposition of Au Nanoclusters on Polymer-wrapped Graphene and Their Oxygen Reduction Activity',
           'Relations of Preschoolers Visual-Motor and Object Manipulation Skills With Executive Function and Social Behavior',
           'We Know Who Likes Us, but Not Who Competes Against Us']

# Reuse one session across all searches for efficiency.
with requests.Session() as s:
    for query in queries:
        url = 'https://scholar.google.com/scholar?q=' + query + '&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search'
        r = s.get(url)
        soup = bs(r.content, 'lxml')  # or 'html.parser'

        # First result's title link; fall back to placeholders when absent.
        hit = soup.select_one('h3.gs_rt a')
        if hit is None:
            title, link = 'No title', 'No link'
        else:
            title, link = hit.text, hit['href']

        # ":contains" pseudo-class needs bs4 >= 4.7.1.
        cited = soup.select_one('a:contains("Cited by")')
        citations = 'No citation count' if cited is None else cited.text

        print(title, link, citations)
The alternative for < 4.7.1.
# Variant for bs4 < 4.7.1 (no ":contains" support in select_one).
with requests.Session() as session:
    for query in queries:
        response = session.get(
            'https://scholar.google.com/scholar?q=' + query +
            '&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search'
        )
        soup = bs(response.content, 'lxml')  # or 'html.parser'

        title_tag = soup.select_one('.gs_rt a')
        if title_tag is None:
            title, link = 'No title', 'No link'
        else:
            link = title_tag['href']
            title = title_tag.text

        # Citation count is the anchor right after the [title=Cite] element.
        citation_tag = soup.select_one('[title=Cite] + a')
        citations = 'No citation count' if citation_tag is None else citation_tag.text

        print(title, link, citations)
Bottom version re-written thanks to comments from #facelessuser. Top version left for comparison:
It would probably be more efficient to not call select_one twice in single line if statement. While the pattern building is cached, the returned tag is not cached. I personally would set the variable to whatever is returned by select_one and then, only if the variable is None, change it to No link or No title etc. It isn't as compact, but it will be more efficient
[...]always check if if tag is None: and not just if tag:. With selectors, it isn't a big deal as they will only return tags, but if you ever do something like for x in tag.descendants: you get text nodes (strings) and tags, and an empty string will evaluate false even though it is a valid node. In that case, it is safest to to check for None
Instead of finding all <h3> tags, I suggest you to search for the tags enclosing both <h3> and the citation (inside <div class="gs_rs>"), i.e. find all <div class="gs_ri"> tags.
Then from these tags, you should be able to get all you need:
# BUG FIX: `query` was the whole *list* of titles; concatenating it into the
# URL string raises TypeError. Search each title in turn instead.
queries = ['Role for migratory wild birds in the global spread of avian influenza H5N8',
           'Uncoupling conformational states from activity in an allosteric enzyme',
           'Technological Analysis of the World’s Earliest Shamanic Costume: A Multi-Scalar, Experimental Study of a Red Deer Headdress from the Early Holocene Site of Star Carr, North Yorkshire, UK',
           'Oxidative potential of PM 2.5 during Atlanta rush hour: Measurements of in-vehicle dithiothreitol (DTT) activity',
           'Primary Prevention of CVD',
           'Growth and Deposition of Au Nanoclusters on Polymer-wrapped Graphene and Their Oxygen Reduction Activity',
           'Relations of Preschoolers Visual-Motor and Object Manipulation Skills With Executive Function and Social Behavior',
           'We Know Who Likes Us, but Not Who Competes Against Us']

results = []
for query in queries:
    url = 'https://scholar.google.com/scholar?q=' + query + '&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search'
    content = requests.get(url).text
    page = BeautifulSoup(content, 'lxml')
    # <div class="gs_ri"> encloses both the <h3> title and the citation info.
    for entry in page.find_all("div", attrs={"class": "gs_ri"}):
        results.append({"title": entry.h3.a.text,
                        "url": entry.a['href'],
                        "citation": entry.find("div", attrs={"class": "gs_rs"}).text})
Make sure you're using user-agent because default requests user-agent is python-requests and Google might block your requests and you receive a different HTML with some sort of error that doesn't contain selectors you're trying to select. Check what's your user-agent.
It also might be a good idea to rotate user-agents while making requests.
Code and full example that scrapes much more in the online IDE:
from bs4 import BeautifulSoup
import requests, lxml

# A real browser User-Agent: the default python-requests UA often gets a
# blocked/alternate HTML page without the expected selectors.
headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

queries = ['Role for migratory wild birds in the global spread of avian influenza H5N8',
           'Uncoupling conformational states from activity in an allosteric enzyme',
           'Technological Analysis of the World’s Earliest Shamanic Costume: A Multi-Scalar, Experimental Study of a Red Deer Headdress from the Early Holocene Site of Star Carr, North Yorkshire, UK',
           'Oxidative potential of PM 2.5 during Atlanta rush hour: Measurements of in-vehicle dithiothreitol (DTT) activity',
           'Primary Prevention of CVD',
           'Growth and Deposition of Au Nanoclusters on Polymer-wrapped Graphene and Their Oxygen Reduction Activity',
           'Relations of Preschoolers Visual-Motor and Object Manipulation Skills With Executive Function and Social Behavior',
           'We Know Who Likes Us, but Not Who Competes Against Us']

for query in queries:
    params = {
        "q": query,
        "hl": "en",
    }
    # BUG FIX: the original passed `proxies=proxies` but no `proxies`
    # variable was ever defined (NameError). Add a proxies dict back only
    # if you actually rotate proxies.
    html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params).text
    soup = BeautifulSoup(html, 'lxml')

    # Container where all needed data is located
    for result in soup.select('.gs_ri'):
        title = result.select_one('.gs_rt').text
        title_link = result.select_one('.gs_rt a')['href']
        cited_by = result.select_one('#gs_res_ccl_mid .gs_nph+ a')['href']
        cited_by_count = result.select_one('#gs_res_ccl_mid .gs_nph+ a').text.split(' ')[2]
        print(f"{title}\n{title_link}\n{cited_by}\n{cited_by_count}\n")
Alternatively, you can achieve the same thing by using Google Scholar Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you only need to iterate over structured JSON and get the data you want, rather than figuring out why certain things don't work as they should.
Code to integrate:
from serpapi import GoogleSearch
import json  # BUG FIX: json.dumps was used below but json was never imported
import os

queries = ['Role for migratory wild birds in the global spread of avian influenza H5N8',
           'Uncoupling conformational states from activity in an allosteric enzyme',
           'Technological Analysis of the World’s Earliest Shamanic Costume: A Multi-Scalar, Experimental Study of a Red Deer Headdress from the Early Holocene Site of Star Carr, North Yorkshire, UK',
           'Oxidative potential of PM 2.5 during Atlanta rush hour: Measurements of in-vehicle dithiothreitol (DTT) activity',
           'Primary Prevention of CVD',
           'Growth and Deposition of Au Nanoclusters on Polymer-wrapped Graphene and Their Oxygen Reduction Activity',
           'Relations of Preschoolers Visual-Motor and Object Manipulation Skills With Executive Function and Social Behavior',
           'We Know Who Likes Us, but Not Who Competes Against Us']

for query in queries:
    params = {
        "api_key": os.getenv("API_KEY"),  # SerpApi key from the environment
        "engine": "google_scholar",
        "q": query,
    }
    search = GoogleSearch(params)
    results = search.get_dict()

    # Flatten the structured JSON into the fields we care about.
    data = []
    for result in results['organic_results']:
        data.append({
            'title': result['title'],
            'link': result['link'],
            'publication_info': result['publication_info']['summary'],
            'snippet': result['snippet'],
            'cited_by': result['inline_links']['cited_by']['link'],
            'related_versions': result['inline_links']['related_pages_link'],
        })
    print(json.dumps(data, indent=2, ensure_ascii=False))
P.S - I wrote a blog post about how to scrape pretty much everything on Google Scholar with visual representation.
Disclaimer, I work for SerpApi.

How to scrape all review product using python

right now im doing scrape review product at this website
https://www.lazada.com.my/products/xiaomi-mi-a1-4gb-ram-32gb-rom-i253761547-s336359472.html?spm=a2o4k.searchlistcategory.list.64.71546883QBZiNT&search=1
i manage to get the review on first page only
import pandas as pd
from urllib.request import Request, urlopen as uReq  # packages for web scraping
from bs4 import BeautifulSoup as soup


def make_soup(website):
    """Download *website* with a browser User-Agent and return parsed HTML."""
    req = Request(website, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()
    return soup(page_html, 'html.parser')


lazada_url = 'https://www.lazada.com.my/products/xiaomi-mi-a1-4gb-ram-32gb-rom-i253761547-s336359472.html?spm=a2o4k.searchlistcategory.list.64.71546883QBZiNT&search=1'
website = make_soup(lazada_url)

news_headlines = pd.DataFrame(columns=['reviews', 'sentiment', 'score'])

# Each review sits in a <div class="item-content">; its first child div
# holds the review text.
headlines = website.findAll('div', attrs={"class": "item-content"})
for n, item in enumerate(headlines):
    text_headlines = item.div.text
    print(text_headlines)
    print()
    # BUG FIX: the DataFrame declares a 'reviews' column, but the original
    # stored into a non-existent 'title' column; keep the data in 'reviews'.
    news_headlines.loc[n, 'reviews'] = text_headlines
The result covers only the first page. How do I do it for all pages? There are no page numbers in the URL for me to loop over — you can check the URL. Thank you :)
I like this phone very much and it's global version. I recommend this phone for who like gaming. Delivery just took 3 days only. Thanks Lazada
Item was received in just two days and was wonderfully wrapped. Thanks for the excellent services Lazada!
Very happy with the phone. It's original, it arrived in good condition. Built quality is superb for a budget phone.
The delivery is very fast just take one day to reach at my home. However, the tax invoice is not attached. How do I get the tax invoice?
great deal from lazada. anyway, i do not find any tax invoice. please do email me the tax invoice. thank you.
You can scrape the pagination at the bottom of the reviews to find the minimum and maximum number of reviews:
import requests
from bs4 import BeautifulSoup as soup
def get_page_reviews(content:soup) -> dict:
    """Extract star counts, review texts and author names from one review page."""
    items = content.find('div', {'class':'mod-reviews'}).find_all('div', {'class':'item'})
    # Stars are rendered as one <img> per star inside the "top" div.
    return {
        'stars': [len(item.find('div', {'class':'top'}).find_all('img')) for item in items],
        'reviews': [item.find('div', {'class':'item-content'}).find('div', {'class':'content'}).text for item in items],
        'authors': [item.find('div', {'class':'middle'}).find('span').text for item in items],
    }
# Load the product page once to read the review pagination buttons.
d = soup(requests.get('https://www.lazada.com.my/products/xiaomi-mi-a1-4gb-ram-32gb-rom-i253761547-s336359472.html?spm=a2o4k.searchlistcategory.list.64.71546883QBZiNT&search=1').text, 'html.parser')
# The numeric page buttons give the min/max review-page numbers.
results = list(map(int, filter(None, [i.text for i in d.find_all('button', {'class':'next-pagination-item'})])))
# Visit every review page in that range.
for i in range(min(results), max(results)+1):
new_url = f'https://www.lazada.com.my/products/xiaomi-mi-a1-4gb-ram-32gb-rom-i253761547-s336359472.html?spm=a2o4k.searchlistcategory.list.64.71546883QBZiNT&search={i}'
#now, can use new_url to request the next page of reviews
r = get_page_reviews(soup(requests.get(new_url).text, 'html.parser'))
# NOTE(review): final_result is built from `r`, the most recent page's data;
# presumably the results should accumulate across pages — confirm intent.
final_result = [{'stars':a, 'author':b, 'review':c} for a, b, c in zip(r['stars'], r['authors'], r['reviews'])]
Output (for first page):
[{'stars': 5, 'author': 'by Ridwan R.', 'review': "I like this phone very much and it's global version. I recommend this phone for who like gaming. Delivery just took 3 days only. Thanks Lazada"}, {'stars': 5, 'author': 'by Razli A.', 'review': 'Item was received in just two days and was wonderfully wrapped. Thanks for the excellent services Lazada!'}, {'stars': 5, 'author': 'by Nur F.', 'review': "Very happy with the phone. It's original, it arrived in good condition. Built quality is superb for a budget phone."}, {'stars': 5, 'author': 'by Muhammad S.', 'review': 'The delivery is very fast just take one day to reach at my home. However, the tax invoice is not attached. How do I get the tax invoice?'}, {'stars': 5, 'author': 'by Xavier Y.', 'review': 'great deal from lazada. anyway, i do not find any tax invoice. please do email me the tax invoice. thank you.'}]
What you need to do is just using click() method in Selenium.
Selenium is a portable software-testing framework for web applications that allows you to access the web and get the sources you want.
In the given URL, there are pages button for the review, so just find the buttons by xpath, class, id by using find_element_by_(anything you want).click(). This will lead you to next pages.
This is the sample code of mine :D
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
from selenium.webdriver.chrome.options import Options

# Product page whose paginated reviews we want to scrape.
# (Fixed: the URL previously contained stray spaces from line wrapping,
# which made the request 404.)
url = 'https://www.lazada.com.my/products/xiaomi-mi-a1-4gb-ram-32gb-rom-i253761547-s336359472.html?spm=a2o4k.searchlistcategory.list.64.71546883QBZiNT&search=1'
chrome_options = Options()
#chrome_options.add_argument("--headless")  # uncomment to run without a visible browser window
browser = webdriver.Chrome('/Users/baejihwan/Documents/chromedriver',
                           chrome_options=chrome_options)
browser.get(url)
time.sleep(0.1)  # brief pause so the JavaScript-rendered reviews have time to load

# Parse the first page of reviews from the rendered DOM.
page_soup = soup(browser.page_source, 'html.parser')
headlines = page_soup.findAll('div', attrs={"class": "item-content"})
for item in headlines:
    top = item.div
    text_headlines = top.text
    print(text_headlines)

# Click the "page 2" pagination button, then re-parse the newly loaded reviews.
# (Fixed: the XPath was garbled — attribute tests use '@id', not '#id',
# and the stray '* .' fragment was removed.)
browser.find_element_by_xpath('//*[@id="module_product_review"]/div/div[3]/div[2]/div/div/button[2]').click()
page_soups = soup(browser.page_source, 'html.parser')
headline = page_soups.findAll('div', attrs={"class": "item-content"})
for item in headline:
    top = item.div
    text_headlines = top.text
    print(text_headlines)
Output:
I like this phone very much and it's global version. I recommend this phone for who like gaming. Delivery just took 3 days only. Thanks Lazada
Item was received in just two days and was wonderfully wrapped. Thanks for the excellent services Lazada!
Very happy with the phone. It's original, it arrived in good condition. Built quality is superb for a budget phone.
The delivery is very fast just take one day to reach at my home. However, the tax invoice is not attached. How do I get the tax invoice?
great deal from lazada. anyway, i do not find any tax invoice. please do email me the tax invoice. thank you.
Penghantaran cepat. Order ahad malam, sampai rabu pagi. Tu pun sbb selasa cuti umum.
Fon disealed dgn bubble wrap dan box.
Dah check mmg original malaysia.
Dpt free tempered glass. Ok je.
Fon so far pakai ok.
Selama ni pakai iphone, bila pakai android ni kekok sikit.
invoice tidak disertakan.
battery dia dikira cpt juga hbs..
Saya telah beli smartphone xioami mi a1 dan telah terima hari ni. Tetapi telefon itu telah rosak. Tidak dapat on.
beli pada 1/6 dgn harga rm599 dpt free gift usb otg type c 64gb jenama sandisk.
delivery pantas, order 1/6 sampai 4/6 tu pon sebab weekend ja kalau x mesti order harini esk sampai dah.
packaging terbaik, dalam kotak ada air bag so memang secure.
kotak fon sealed, dlm kotak dapat screen protector biasa free, kabel type c dgn charger 3 pin.
keluar kotak terus update ke Android oreo, memang puas hati la overall. memang berbaloi sangat beli. Kudos to lazada.
i submitted the order on on sunday and i get it tuesday morning, even the despatch guy called me at 830am just to make sure if im already at the office. super reliable. for the phone, well i got it for RM599. what could you possibly asked for more? hehehe
Purchased Xiaomi Mi A1 from Official store with an offer of "Free gift SanDisk Ultra 64GB Dual USB Drive 3.0 OTG Type C Flash Drive". But they delivered only USB drive 2.0
I've tried it in an extremely naive way! It would be better to define a function that reads in the HTML and parses out the data you want. This code only parses the reviews up to page 2, but you can modify it to get all the reviews to the end! :D If you have questions about this code, please leave a comment!
Hope this helps!

Cannot scrape specific content from site - BeautifulSoup 4

I am having hard luck scraping this link via Python 3, BeautifulSoup 4
http://www.radisson.com/lansing-hotel-mi-48933/lansing/hotel/dining
I only want to get this section.
When you are in ...
Capitol City Grille
This downtown Lansing restaurant offers ...
Capitol City Grille Lounge
For a glass of wine or a ...
Room Service
If you prefer ...
I have this code
# Walk every <strong> inside div.copy_left; each one is a section title
# whose description follows two siblings later.
for rest in dining_page_soup.select("div.copy_left p strong"):
    sibling = rest.next_sibling
    if sibling is None:
        continue  # no node after the title — nothing to print
    candidate = sibling.next_sibling
    if candidate is None:
        continue  # title has no description paragraph
    title = rest.text
    desc = candidate
    print("Title: " + title)
    print(desc)
But it gives me TypeError: 'NoneType' object is not callable
on desc = rest.next_sibling.next_sibling, even though I have an if statement to check whether it is None or not.
Here it is a very simple solution
from bs4 import BeautifulSoup
import requests

# Fetch the hotel dining page and print every restaurant section in full.
r = requests.get("http://www.radisson.com/lansing-hotel-mi-48933/lansing/hotel/dining")
data = r.text
# Explicit parser keeps results deterministic and silences bs4's
# GuessedAtParserWarning.
soup = BeautifulSoup(data, "html.parser")
for found_text in soup.select('div.copy_left'):
    # print() instead of the Python-2 print statement, so this snippet
    # runs on Python 3 like the rest of the examples in this file.
    print(found_text.text)
UPDATE
According to an improvement of the question, here it is a solution using RE.
A specific workaround has to be made for the 1st paragraph "When you..." since it does not follow the structure of the other paragraphs.
# Match every tag whose name starts with "strong"; its title text is the
# section heading, and the description sits two siblings further on.
for tag in soup.find_all(re.compile("^strong")):
    desc = tag.next_sibling.next_sibling
    title = tag.text
    print("Title: " + title)
    print(desc)
Output
Title: Capitol City Grille
This downtown Lansing restaurant offers delicious, contemporary
American cuisine in an upscale yet relaxed environment. You can enjoy
dishes that range from fluffy pancakes to juicy filet mignon steaks.
Breakfast and lunch buffets are available, as well as an à la carte
menu.
Title: Capitol City Grille Lounge
For a glass of wine or a hand-crafted cocktail and great conversation,
spend an afternoon or evening at Capitol City Grille Lounge with
friends or colleagues.
Title: Room Service
If you prefer to dine in the comfort of your own room, order from the
room service menu.
Title: Menus
Breakfast Menu
Title: Capitol City Grille Hours
Breakfast, 6:30-11 a.m.
Title: Capitol City Grille Lounge Hours
Mon-Thu, 11 a.m.-11 p.m.
Title: Room Service Hours
Daily, 6:30 a.m.-2 p.m. and 5-10 p.m.
If you don't mind using xpath, this should work
import requests
from lxml import html

# Fetch and parse the dining page into an lxml element tree.
url = "http://www.radisson.com/lansing-hotel-mi-48933/lansing/hotel/dining"
page = requests.get(url).text
tree = html.fromstring(page)

# Titles: <strong> tags inside .copy_left that are not link labels.
# (Fixed: '#class' was an extraction artifact — XPath attribute tests
# use '@class'; the original strings raised XPathEvalError.)
xp_t = "//*[@class='copy_left']/descendant-or-self::node()/strong[not(following-sibling::a)]/text()"
# Descriptions: the text nodes following each title within the same parent.
xp_d = "//*[@class='copy_left']/descendant-or-self::node()/strong[not(following-sibling::a)]/../text()[not(following-sibling::strong)]"
titles = tree.xpath(xp_t)
descriptions = tree.xpath(xp_d)  # still contains garbage like '\r\n'
descriptions = [d.strip() for d in descriptions if d.strip()]
for t, d in zip(titles, descriptions):
    print("{title}: {description}".format(title=t, description=d))
Here descriptions contains 3 elements: "This downtown...", "For a glass...", "If you prefer...".
If you need also "When you are in the mood...", replace with this:
xp_d = "//*[@class='copy_left']/descendant-or-self::node()/strong[not(following-sibling::a)]/../text()"

Categories

Resources