I cannot separate scraped data into different category - python

I need help from python expert. This is a site where I have to scrape table data and separate into four different category then convert it into excel file but problem is all table category's classes are same.
There should be different four classes but same four classes
Thanks
Mariful
Website for scrape
import requests
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd
url = "https://www.kpaa.or.kr/kpaa/eng/list.do?"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
items = soup.find_all(class_='title')
for item in items:
n = item.text
print(n)
df = pd.Dataframe({'name':n, 'office':n, 'phone':n, 'email':n})

Here is i try to convert single data to 2D list to use in data pandas data frame.
from bs4 import BeautifulSoup
import csv
import re
import pandas as pd
url = "https://www.kpaa.or.kr/kpaa/eng/list.do?"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
data_list = [td.getText(strip=True, separator=',').split(',') for td in soup.find('div', {'class':'cont_box2'}).find_all('tr')[:-1]]
df = pd.DataFrame(data_list)
df.to_excel('x.xlsx')

Related

Web scraping in python; Output to excel returns HTML instead of the data frame

I'm new to python and practicing web scraping. I was writing a code to get all the movie names and the the consecutive year of the movies. I got the result into a dataframe, but when i am exporting to excel, its showing as the html code. Thanks in advance.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os
pages=np.arange(1,2,1)
pwd = os.getcwd()
yify = pd.DataFrame (columns = ['Title', 'Year'])
#looping through different pages
for page in pages:
page='https://yts.mx/browse-movies?page='+str(page)
data=requests.get(page).text
soup = BeautifulSoup(data,'html.parser')
#looping through all the movie names and years
for row in soup.find_all('div', class_ = 'browse-movie-bottom'):
title=row.find('a', class_ = 'browse-movie-title')
years=row.find('div', class_ = 'browse-movie-year')
yify=yify.append({'Title': title, 'Year':years},ignore_index=True)
yify.head()
output.to_excel(pwd + '\\Yify_Test_Output.xlsx', index=False )
Try:
for row in soup.find_all('div', class_ = 'browse-movie-bottom'):
title=row.find('a', class_ = 'browse-movie-title').text.strip()
years=row.find('div', class_ = 'browse-movie-year').text.strip()
As a short explanation: the OP was retrieving the actual HTML element, including HTML tags. You extract the text from the element with <html_element>.text.

How to get data from website into Pandas dataframe

I need to write a Python script that will use data from a website like this link and put it into a pandas dataframe.
My try is
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
url = "https://www.w3schools.com/sql/trysql.asp?filename=trysql_op_in"
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
league_table = soup.find('table', class_ = "table.ws-table-all.notranslate")
print(league_table)
And I got output: None

Python html parsing using beautiful soup issues

I am trying to get the name of all organizations from https://www.devex.com/organizations/search using beautifulsoup.However, I am getting an error. Can someone please help.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
headers = {"Accept-Language": "en-US,en;q=0.5"}
titles = []
pages = np.arange(1, 2, 1)
for page in pages:
page = requests.get("https://www.devex.com/organizations/search?page%5Bnumber%5D=" + str(page) + "", headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
movie_div = soup.find_all('div', class_='info-container')
sleep(randint(2,10))
for container in movie_div:
name = container.a.find('h3', class_= 'ng-binding').text
titles.append(name)
movies = pd.DataFrame({
'movie': titles,
})
to see your dataframe
print(movies)
to see the datatypes of your columns
print(movies.dtypes)
to see where you're missing data and how much data is missing
print(movies.isnull().sum())
to move all your scraped data to a CSV file
movies.to_csv('movies.csv')
you may try with something like
name = bs.find("h3", {"class": "ng-binding"})

Issues writing data scraper

I have to make a code in order to scrape datafrom a website and then analyse them for university.
My problem is that I made this code in order to get some data for all products but when I run it it only shows a single response for each variable.
Can you help me resolve this error ?
from bs4 import BeautifulSoup as soup
import urllib
from urllib.request import urlopen as uReq
import requests
myurl='https://boutique.orange.fr/mobile/choisir-un-mobile'
Uclient=uReq(myurl)
page=Uclient.read()
Uclient.close()
pagesoup=soup(page,'html.parser')
containers=pagesoup.findAll('div',{'class':'box-prod pointer'})
container=containers[0]
produit=container.img['alt']
price=container.findAll('span',{'class':'price'})
price2=container.findAll('div',{'class':'prix-seul'})
avis=container.footer.div.a.img['alt']
file="orange.csv"
f=open(file,'w')
headers='produit,prix avec abonnement, prix seul, avis\n'
f.write(headers)
for container in containers:
produit=container.img['alt']
price=container.findAll('span',{'class':'price'})
price2=container.findAll('div',{'class':'prix-seul'})
avis=container.footer.div.a.img['alt']
You could use different selectors. Separate two prices per product by index. Extract price specific info using join and findall.
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'https://boutique.orange.fr/mobile/choisir-un-mobile'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
#print(len(soup.select('#resultat .box-prod.pointer')))
p = re.compile('[0-9,€]+')
altText= [item.get('alt').strip() for item in soup.select('#resultat .box-prod.pointer .lazy')]
titles = [item.text.strip().replace('\n', ' ') for item in soup.select('#resultat .box-prod.pointer .titre-produit')]
allPrices = [''.join(p.findall(item.text)) for item in soup.select('#resultat span.price')]
aPartirPrice = allPrices[0::2]
prixSeul = allPrices[1::2]
items = list(zip(titles, altText, aPartirPrice, prixSeul))
df = pd.DataFrame(items,columns=['title', 'altText', 'aPartirPrice', 'prixSeul'])
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8',index = False )
Transpose with:
df = df.T

Creating Pandas Dataframe from WebScraping Results

I am trying to scrape a table from espn and send the data to a pandas dataframe in order to export it to excel. I have completed most of the scraping, but am getting stuck on how to send each 'td' tag to a unique dataframe cell within my for loop. (Code is below) Any thoughts? Thanks!
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import os
import csv
import pandas as pd
def make_soup(url):
thepage = urllib.request.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
soup = make_soup("http://www.espn.com/nba/statistics/player/_/stat/scoring-
per-game/sort/avgPoints/qualified/false")
regex = re.compile("^[e-o]")
for record in soup.findAll('tr', {"class":regex}):
for data in record.findAll('td'):
print(data)
I was actually recently scraping sports websites working on a daily fantasy sports algorithm for a class. This is the script I wrote up. Perhaps this approach can work for you. Build a dictionary. Convert it to a dataframe.
url = http://www.footballdb.com/stats/stats.html?lg=NFL&yr={0}&type=reg&mode={1}&limit=all
result = requests.get(url)
c = result.content
# Set as Beautiful Soup Object
soup = BeautifulSoup(c)
# Go to the section of interest
tables = soup.find("table",{'class':'statistics'})
data = {}
headers = {}
for i, header in enumerate(tables.findAll('th')):
data[i] = {}
headers[i] = str(header.get_text())
table = tables.find('tbody')
for r, row in enumerate(table.select('tr')):
for i, cell in enumerate(row.select('td')):
try:
data[i][r] = str(cell.get_text())
except:
stat = strip_non_ascii(cell.get_text())
data[i][r] = stat
for i, name in enumerate(tables.select('tbody .left .hidden-xs a')):
data[0][i] = str(name.get_text())
df = pd.DataFrame(data=data)

Categories

Resources