How to write all web scraping data to an HTML file using Python?

import requests
from bs4 import BeautifulSoup

url = "https://gk-hindi.in/gk-questions?page="
i = 1
while i <= 48:
    req = requests.get(url + str(i))
    soup = BeautifulSoup(req.content, "html.parser")
    mydivs = soup.find("div", {"class": "question-wrapper"})
    print(mydivs)
    with open("output.html", "w", encoding='utf-8') as file:
        file.write(str(mydivs))
    i = i + 1
I want to save the data from every loop iteration in a single HTML file, but my code saves only the last iteration's data.

Open the output file once, before the scraping loop, in append mode a (assuming the file already exists; otherwise use w mode):
url = "https://gk-hindi.in/gk-questions?page="
with open("output.html", "a", encoding = 'utf-8') as file:
for i in range(1, 49):
req = requests.get(url + str(i))
soup = BeautifulSoup(req.content, "html.parser")
mydivs = soup.find("div", {"class": "question-wrapper"})
file.write(str(mydivs))

You have to open the file in append mode
with open("output.html", "a", encoding = 'utf-8') as file:
pass
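If you prefer to keep the with open(...) block inside the loop, as in the original code, switching the mode from "w" to "a" also works, since each iteration then appends to the file instead of truncating it. A minimal sketch of that variant, using the same URL and selector as above:

import requests
from bs4 import BeautifulSoup

url = "https://gk-hindi.in/gk-questions?page="
for i in range(1, 49):
    req = requests.get(url + str(i))
    soup = BeautifulSoup(req.content, "html.parser")
    mydivs = soup.find("div", {"class": "question-wrapper"})
    # "a" appends on every iteration; "w" would truncate the file each time
    with open("output.html", "a", encoding="utf-8") as file:
        file.write(str(mydivs))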

Related

Unable to get the desired web link at the time of web scraping

I want to do web scraping using BeautifulSoup4, but I can't get the links I want. How can I get the links I want when scraping?
import json
import requests
from bs4 import BeautifulSoup

url_web = {
    "cnn": "https://www.cnnindonesia.com/search/?query=citayam&page=",
    "detik": "https://www.detik.com/search/searchall?query=citayam&siteid=2",
    "kompas": "https://search.kompas.com/search/?q=citayam&submit=Submit"
}

list_cnn = []
for i in range(1, 33):
    URL = url_web['cnn'] + str(i)
    print(i, '/', 1, ' - ', URL)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    print(soup)
    results = soup.find("div", class_="media_rows")
    for result in results.find_all("a"):
        print(result)
        href_elem = result.get('href')
        list_cnn.append(href_elem)

print(list_cnn)

root_path = 'gdrive/My Drive/analisa_cfw/'
with open(root_path + 'list_cnn.json', "w", encoding='utf8') as outfile:
    json.dump(list_cnn, outfile, ensure_ascii=False)
print("Tokenized_sent json saved!")
Try replacing this section of your code:
results = soup.find("div", class_="media_rows")
for result in results.find_all("a"):
    print(result)
    href_elem = result.get('href')
    list_cnn.append(href_elem)
with this:
results = soup.find_all("article")
for result in results:
    try:
        a = result.find('a')
        href_elem = a.get('href')
        print(href_elem)
        list_cnn.append(href_elem)
    except:
        pass
Complete code:
import json
import requests
from bs4 import BeautifulSoup

url_web = {
    "cnn": "https://www.cnnindonesia.com/search/?query=citayam&page=",
    "detik": "https://www.detik.com/search/searchall?query=citayam&siteid=2",
    "kompas": "https://search.kompas.com/search/?q=citayam&submit=Submit"
}

list_cnn = []
for i in range(1, 33):
    URL = url_web['cnn'] + str(i)
    print(i, '/', 1, ' - ', URL)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all("article")
    for result in results:
        try:
            a = result.find('a')
            href_elem = a.get('href')
            print(href_elem)
            list_cnn.append(href_elem)
        except:
            pass

root_path = 'gdrive/My Drive/analisa_cfw/'
with open(root_path + 'list_cnn.json', "w", encoding='utf8') as outfile:
    json.dump(list_cnn, outfile, ensure_ascii=False)
print("Tokenized_sent json saved!")

How do I ensure that BeautifulSoup does not look at commas as tabs

I have created a scraping code to take information from a local newspaper site. I have two existing problems with the current code.
When it retrieves the paragraph data and saves it to the CSV, it treats "," as a column break and saves the remainder in the adjacent cell. How do I stop this from happening?
I want the scraped information in rows, i.e. paragraph, title, weblink.
Code below:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

page_url = "https://neweralive.na/today/"
ne_url = "https://neweralive.na/posts/"

uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

containers = page_soup.findAll("article", {"class": "post-item"})

filename = "newera.csv"
headers = "paragraph,title,link\n"
f = open(filename, "w")
f.write(headers)

for container in containers:
    paragraph_container = container.findAll("p", {"class": "post-excerpt"})
    paragraph = paragraph_container[0].text
    title_container = container.findAll("h3", {"class": "post-title"})
    title = title_container[0].text
    weblink = ne_url + title_container[0].a["href"]
    f.write(paragraph + "," + title + "," + weblink + "\n")

f.close()
You can use the built-in csv module to write well-formed CSV with quotes around strings that need them (e.g. those containing commas).
While at it, I refactored your code to use reusable functions:
get_soup_from_url() downloads a URL and builds a BeautifulSoup object from it
parse_today_page() is a generator function that walks through that soup and yields a dict for each article
The main code then just uses csv.DictWriter on the opened file; each parsed dict is printed to the console for easier debugging and fed to the CSV writer for output.
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv

base_url = "https://neweralive.na/posts/"


def get_soup_from_url(url):
    resp = urlopen(url)
    page_soup = BeautifulSoup(resp.read(), "html.parser")
    resp.close()
    return page_soup


def parse_today_page(page_soup):
    for container in page_soup.findAll("article", {"class": "post-item"}):
        paragraph_container = container.findAll(
            "p", {"class": "post-excerpt"}
        )
        paragraph = paragraph_container[0].text
        title_container = container.findAll("h3", {"class": "post-title"})
        title = title_container[0].text
        weblink = base_url + title_container[0].a["href"]
        yield {
            "paragraph": paragraph,
            "title": title.strip(),
            "link": weblink,
        }


print("Downloading...")
page_soup = get_soup_from_url("https://neweralive.na/today/")

# newline="" avoids blank lines between rows on Windows, as recommended by the csv docs
with open("newera.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, ["paragraph", "title", "link"])
    writer.writeheader()
    for entry in parse_today_page(page_soup):
        print(entry)
        writer.writerow(entry)
The generated CSV ends up looking like e.g.
paragraph,title,link
"The mayor of Helao Nafidi, Elias Nghipangelwa, has expressed disappointment after Covid-19 relief food was stolen and sold by two security officers entrusted to guard the warehouse where the food was stored.","Guards arrested for theft of relief food",https://neweralive.na/posts/posts/guards-arrested-for-theft-of-relief-food
"Government has decided to construct 1 200 affordable homes, starting Thursday this week.","Govt to construct 1 200 low-cost houses",https://neweralive.na/posts/posts/govt-to-construct-1-200-low-cost-houses
...
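To see why this solves the comma problem: csv.writer (and csv.DictWriter) automatically quotes any field that contains the delimiter, so a comma inside the paragraph no longer splits the row. A minimal, self-contained illustration with made-up data (not from the site):

import csv
import io

buf = io.StringIO()
writer = csv.writer(buf)
# the first field contains a comma, so the writer wraps it in quotes
writer.writerow(["Hello, world", "title", "https://example.com"])
print(buf.getvalue())  # "Hello, world",title,https://example.com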
You can also use the pandas module and convert a DataFrame to CSV easily.
import pandas as pd
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

page_url = "https://neweralive.na/today/"
ne_url = "https://neweralive.na/posts/"

uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

containers = page_soup.findAll("article", {"class": "post-item"})

filename = "newera.csv"
rows = []  # list of lists, later converted to a DataFrame

for container in containers:
    paragraph_container = container.findAll("p", {"class": "post-excerpt"})
    paragraph = paragraph_container[0].text
    title_container = container.findAll("h3", {"class": "post-title"})
    title = title_container[0].text
    weblink = ne_url + title_container[0].a["href"]
    rows.append([paragraph, title, weblink])  # each row is appended

df = pd.DataFrame(rows, columns=["paragraph", "title", "link"])  # column names become the CSV headers
df.to_csv(filename, index=None)
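Note that DataFrame.to_csv applies the same minimal quoting as the csv module by default, so paragraphs containing commas are quoted automatically and stay in a single cell.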

Python HTML Parser Pagination

I'm new to Python and have managed to get this far with the HTML parser, but I'm stuck on how to get pagination working for the reviews at the bottom of the page.
The URL is in the PasteBin code; I am leaving it out of this thread for privacy reasons.
Any help is much appreciated.
# Reviews scrape
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'EXAMPLE.COM'

# opening up the connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML parsing
page_soup = soup(page_html, "html.parser")

# grabs each review
reviews = page_soup.findAll("div", {"class": "jdgm-rev jdgm-divider-top"})

filename = "compreviews.csv"
f = open(filename, "w")
headers = "Score, Title, Content\n"
f.write(headers)

# HTML lookup location per website, stripping spacing
for container in reviews:
    # score = container.div.div.span["data-score"]
    score = container.findAll("span", {"data-score": True})
    user_score = score[0].text.strip()
    title_review = container.findAll("b", {"class": "jdgm-rev__title"})
    user_title = title_review[0].text.strip()
    content_review = container.findAll("div", {"class": "jdgm-rev__body"})
    user_content = content_review[0].text.strip()
    print("user_score:" + score[0]['data-score'])
    print("user_title:" + user_title)
    print("user_content:" + user_content)
    f.write(score[0]['data-score'] + "," + user_title + "," + user_content + "\n")

f.close()
The page does an XHR GET request using a query string to fetch results. This query string has parameters for reviews per page and page number. You can make an initial request with what seems to be the maximum of 31 reviews per page, extract the html from the returned json, grab the page count, then loop over all the remaining pages collecting results. Example construct below:
import requests
from bs4 import BeautifulSoup as bs

start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'

with requests.Session() as s:
    r = s.get(start_url).json()
    soup = bs(r['html'], 'lxml')
    print([i.text for i in soup.select('.jdgm-rev__author')])
    print([i.text for i in soup.select('.jdgm-rev__title')])
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])

    for page in range(2, total_pages + 1):
        r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
        soup = bs(r['html'], 'lxml')
        print([i.text for i in soup.select('.jdgm-rev__author')])
        print([i.text for i in soup.select('.jdgm-rev__title')])  # etc
Example dataframe to csv
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'

authors = []
titles = []

with requests.Session() as s:
    r = s.get(start_url).json()
    soup = bs(r['html'], 'lxml')
    authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
    titles.extend([i.text for i in soup.select('.jdgm-rev__title')])
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])

    for page in range(2, total_pages + 1):
        r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
        soup = bs(r['html'], 'lxml')
        authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
        titles.extend([i.text for i in soup.select('.jdgm-rev__title')])  # etc

headers = ['Author', 'Title']
df = pd.DataFrame(zip(authors, titles), columns=headers)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8', index=False)
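If you prefer not to build the query string by hand, requests can assemble it from a params dict. A minimal sketch under the same assumptions (placeholder base URL and product_id, since the real endpoint was redacted):

import requests
from bs4 import BeautifulSoup as bs

# hypothetical endpoint and product_id; substitute the real XHR URL found in the
# browser's network tab
base = 'https://urlpart'
with requests.Session() as s:
    r = s.get(base, params={'page': 1, 'per_page': 31, 'product_id': 'someid'}).json()
    soup = bs(r['html'], 'lxml')
    print([i.text for i in soup.select('.jdgm-rev__title')])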

Python web scraping and writing data into CSV

I'm trying to save all the data (i.e. all pages) in a single CSV file, but this code only saves the final page's data. E.g. here url[] contains 2 URLs, and the final CSV only contains the 2nd URL's data.
I'm clearly doing something wrong in the loop, but I don't know what.
Also, this page contains 100 data points, but this code only writes the first 44 rows.
Please help with this issue.
from bs4 import BeautifulSoup
import requests
import csv

url = ["http://sfbay.craigslist.org/search/sfc/npo",
       "http://sfbay.craigslist.org/search/sfc/npo?s=100"]

for ur in url:
    r = requests.get(ur)
    soup = BeautifulSoup(r.content)
    g_data = soup.find_all("a", {"class": "hdrlnk"})
    gen_list = []
    for row in g_data:
        try:
            name = row.text
        except:
            name = ''
        try:
            link = "http://sfbay.craigslist.org" + row.get("href")
        except:
            link = ''
        gen = [name, link]
        gen_list.append(gen)

with open('filename2.csv', 'wb') as file:
    writer = csv.writer(file)
    for row in gen_list:
        writer.writerow(row)
gen_list is being initialized again inside your loop that runs over the urls:
gen_list=[]
Move this line outside the for loop.
...
url = ["http://sfbay.craigslist.org/search/sfc/npo","http://sfbay.craigslist.org/search/sfc/npo?s=100"]
gen_list=[]
for ur in url:
...
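Putting it together, a minimal sketch of the corrected flow (same URLs and selectors as in the question): gen_list is created once, so results from every page accumulate, and the file is written once after both pages have been scraped.

from bs4 import BeautifulSoup
import requests
import csv

url = ["http://sfbay.craigslist.org/search/sfc/npo",
       "http://sfbay.craigslist.org/search/sfc/npo?s=100"]

gen_list = []  # created once, outside the loop, so all pages accumulate here
for ur in url:
    r = requests.get(ur)
    soup = BeautifulSoup(r.content, "html.parser")
    for row in soup.find_all("a", {"class": "hdrlnk"}):
        name = row.text
        link = "http://sfbay.craigslist.org" + row.get("href")
        gen_list.append([name, link])

# 'wb' suits the Python 2 csv module; on Python 3 use open(..., 'w', newline='')
with open('filename2.csv', 'wb') as file:
    writer = csv.writer(file)
    for row in gen_list:
        writer.writerow(row)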
I found your post later; you could also try this method:
import requests
from bs4 import BeautifulSoup
import csv

final_data = []
url = "https://sfbay.craigslist.org/search/sss"

r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")
get_details = soup.find_all(class_="result-row")

for details in get_details:
    getclass = details.find_all(class_="hdrlnk")
    for link in getclass:
        link1 = link.get("href")
        sublist = []
        sublist.append(link1)
        final_data.append(sublist)
print(final_data)

filename = "sfbay.csv"
with open("./" + filename, "w") as csvfile:
    csvfile = csv.writer(csvfile, delimiter=",")
    csvfile.writerow("")
    for i in range(0, len(final_data)):
        csvfile.writerow(final_data[i])

How do I get my BeautifulSoup output data into a text file?

Here's the code:
import urllib2
from bs4 import BeautifulSoup

url = urllib2.urlopen("http://link").read()
soup = BeautifulSoup(url)
file = open("parseddata.txt", "wb")
for line in soup.find_all('a', attrs={'class': 'book-title-link'}):
    print (line.get('href'))
    file.write(line.get('href'))
    file.flush()
    file.close()
file.close() should be called only once, after the for loop:
import urllib2
from bs4 import BeautifulSoup

url = urllib2.urlopen("http://link").read()
soup = BeautifulSoup(url)
file = open("parseddata.txt", "wb")
for line in soup.find_all('a', attrs={'class': 'book-title-link'}):
    href = line.get('href')
    print href
    if href:
        file.write(href + '\n')
file.close()
UPDATE: you can use href=True to avoid the if statement. In addition, with a with statement you don't need to close the file object manually:
import urllib2
from bs4 import BeautifulSoup

content = urllib2.urlopen("http://link").read()
soup = BeautifulSoup(content)
with open('parseddata.txt', 'wb') as f:
    for a in soup.find_all('a', attrs={'class': 'book-title-link'}, href=True):
        print a['href']
        f.write(a['href'] + '\n')
I just do this:
with open('./output/' + filename + '.html', 'w+') as f:
    f.write(temp.prettify("utf-8"))
temp is the HTML parsed by BeautifulSoup.
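Note that the snippets above use urllib2, which exists only in Python 2. A rough Python 3 equivalent of the with-statement version, assuming the same placeholder URL and link class:

from urllib.request import urlopen
from bs4 import BeautifulSoup

content = urlopen("http://link").read()
soup = BeautifulSoup(content, "html.parser")
with open('parseddata.txt', 'w', encoding='utf-8') as f:
    for a in soup.find_all('a', attrs={'class': 'book-title-link'}, href=True):
        print(a['href'])
        f.write(a['href'] + '\n')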
