I created a script to pull data from a website and write it into Excel. The script works; however, when I extract the Name from the website and write it to Excel, the text appears very small for some reason. When you click into the cell, it is normal size. (I tried to include a screenshot as an example, but Stack Overflow won't allow me.)
Here is the current code
from openpyxl import load_workbook
from bs4 import BeautifulSoup
import requests
# Fetch the HTML page
url = 'https://esearch.mobilecopropertytax.com/Property/View/466089'
response = requests.get(url)
html = response.text
# Parse the HTML page
soup = BeautifulSoup(html, 'lxml')
# Find the element containing the Parcel Number
element = soup.find('th', text='Parcel Number:')
# Extract the Parcel Number from the element
parcel_number = element.find_next_sibling().text
# Find the element containing the Name
element = soup.find('th', text='Name:')
# Extract the Name from the element
name = element.find_next_sibling().text
# Load the workbook
wb = load_workbook(r'C:\Users\user\EJW Test\EJWtest.xlsx')
# Select the sheet
ws = wb['Justification Worksheet']
# Select the cells
cell1 = ws['C7'] # Parcel Number
cell2 = ws['C10'] # Name
# Set the values of the cells
cell1.value = parcel_number
cell2.value = name
# Save the workbook
wb.save('completetest.xlsx')
I'm unsure why it writes the parcel info just fine, but when it inserts the name into Excel it looks completely different.
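One thing worth checking (an assumption on my part, since the screenshot isn't available): text pulled with .text can carry leading/trailing whitespace or newlines from nested tags, which can change how the cell displays. A minimal sketch that strips the scraped values before writing them, reusing the soup and worksheet objects from the code above:
# Sketch only: get_text(strip=True) removes surrounding whitespace/newlines
# from the scraped values before they are written to the worksheet.
parcel_number = soup.find('th', text='Parcel Number:').find_next_sibling().get_text(strip=True)
name = soup.find('th', text='Name:').find_next_sibling().get_text(strip=True)
ws['C7'].value = parcel_number
ws['C10'].value = name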
I am trying to get multiple image URLs for each product in a single row. Assume I have a product with 10 image URLs; I want all of those URLs in a single row, but right now I am seeing 10 rows for 10 image URLs. Here is my code:
box = soup.find_all('div',{'class':'row product-container'})
for i in box:
    image = i.select('.carousel-image-wrapper img')
    for i in image:
        image_link = i['src']
        print(image_link)
        with open("image_src.csv", "a", encoding="utf-8") as f:
            writeFile = csv.writer(f)
            writeFile.writerow([image_link])
my python shell result:
https://images.guns.com/prod/ENM9F4JTvCd8689Sj0lLCoINZaip886IXvGur34a.png?imwidth=900
https://images.guns.com/prod/hrHyuVKf00K9FLOZbWLDrhD8nPrmclhhbtsBGCng.png?imwidth=900
https://images.guns.com/prod/CYgxJ0MFO5QFYzykRkTFyyuPp1wdhOAdyIrdhPYS.png?imwidth=900
https://images.guns.com/prod/hlkLmozTLHocAfd4soS8KIYUw82EXp1f8fBJao6k.png?imwidth=900
https://images.guns.com/prod/rBbDfuJatu05z23Wf4dP6rAQygo1gut6miQbPyGk.png?imwidth=900
https://images.guns.com/prod/0323qYoH0ughOICdbMjg6ljsRqD5M2TqGRDbojPG.png?imwidth=900
>>>
This product has 6 image URLs, and I am seeing 6 rows for the 6 image URLs in my CSV file. I want to combine these 6 URLs into a single row, separated by commas.
You can do it like this:
Create a list, image_links.
Add all the image URLs to that list.
At the end, write that list to the CSV file.
Here is the code
import csv
from bs4 import BeautifulSoup
import requests
r = requests.get('https://www.guns.com/firearms/shotguns/semi-auto/beretta-a400-xtreme-plus-lh-ko-realtree-max-5-12-gauge-semi-auto-2-1-rounds-28-barrel-new?p=49038&soldout=1')
soup = BeautifulSoup(r.text, 'lxml')
box = soup.find_all('div',{'class':'row product-container'})
image_links = []
for i in box:
    image = i.select('.carousel-image-wrapper img')
    for i in image:
        image_links.append(i['src'])

with open("image_src.csv", "a", encoding="utf-8") as f:
    writeFile = csv.writer(f)
    writeFile.writerow(image_links)
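Note that writerow(image_links) puts each URL in its own column of that single row. If you instead want all the URLs in one cell separated by commas, as described in the question, a small variation (my assumption about the desired output) is to join them first:
with open("image_src.csv", "a", encoding="utf-8") as f:
    writeFile = csv.writer(f)
    # join the URLs into one comma-separated string so they land in a single cell
    writeFile.writerow([", ".join(image_links)])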
I used Python 3 and pandas to parse the daily close from WSJ into Excel. However, the daily close shown on the web page cannot be extracted. Here is the link: "https://quotes.wsj.com/index/COMP/historical-prices"
How can I download the close data shown on screen into Excel?
And how can I download the file behind the "DOWNLOAD A SPREADSHEET" button into Excel under another name, like comp.xlsx?
Here is the code:
import requests
import pandas as pd
url = 'https://quotes.wsj.com/index/COMP/historical-prices'
jsonData = requests.get(url).json()
final_df = pd.DataFrame()
for row in jsonData['data']:
    #row = jsonData['data'][1]
    data_row = []
    for idx, colspan in enumerate(row['colspan']):
        colspan_int = int(colspan[0])
        data_row.append(row['td'][idx] * colspan_int)
    flat_list = [item for sublist in data_row for item in sublist]
    temp_row = pd.DataFrame([flat_list])
    final_df = final_df.append(temp_row, sort=True).reset_index(drop=True)
wait2 = input("PRESS ENTER TO CONTINUE.")
Follow-up question code for the three quotes:
#
url = 'https://quotes.wsj.com/index/HK/XHKG/HSI/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('HSI.csv', 'wb').write(response.content)
read_file = pd.read_csv (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\HSI.csv')
read_file.to_excel (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\HSI.xlsx', index = None, header=True)
#
url = 'https://quotes.wsj.com/index/SPX/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('SPX.csv', 'wb').write(response.content)
read_file = pd.read_csv (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\SPX.csv')
read_file.to_excel (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\SPX.xlsx', index = None, header=True)
#
url = 'https://quotes.wsj.com/index/COMP/historical-prices/download?num_rows=15&range_days=15&endDate=12/06/2019'
response = requests.get(url)
open('COMP.csv', 'wb').write(response.content)
read_file = pd.read_csv (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\COMP.csv')
read_file.to_excel (r'C:\A-CEO\REPORTS\STOCKS\PROFILE\Python\COMP.xlsx', index = None, header=True)
The URL is wrong; once you download the spreadsheet manually, you can do "Get Info" (on a Mac) and check "Where From:"; you'll see the real download URL is of the form below.
import requests
import pandas as pd
import io
#original URL had a bunch of other parameters I omitted, only these seem to matter but YMMV
url = 'https://quotes.wsj.com/index/COMP/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
response = requests.get(url)
#do this if you want the CSV written to your machine
open('test_file.csv', 'wb').write(response.content)
# this decodes the content of the downloaded response and presents it to pandas
df_test = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
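If you also want that data written out to Excel under another name, as asked in the question, pandas can do that directly; a minimal sketch, assuming an Excel writer engine such as openpyxl is installed:
# Sketch: write the decoded CSV data to an .xlsx file under a chosen name.
df_test.to_excel('comp.xlsx', index=False)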
To answer your additional question -- you can simply loop across a list of tickers or symbols, something like:
base_url = 'https://quotes.wsj.com/index/{ticker_name}/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
ticker_list = ['COMP','SPX','HK/XHKG/HSI']
for ticker in ticker_list:
    response = requests.get(base_url.format(ticker_name = ticker))
    #do this if you want the CSV written to your machine
    open('prices_'+ticker.replace('/','-')+'.csv', 'wb').write(response.content)
Note that for HK/XHKG/HSI we need to replace the slashes with hyphens, or it won't be a valid filename. You can also use this pattern to make DataFrames.
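For illustration, a minimal sketch of that DataFrame pattern (same URL template and ticker list as above, decoding each response in memory rather than writing it to disk):
import io
import requests
import pandas as pd

base_url = 'https://quotes.wsj.com/index/{ticker_name}/historical-prices/download?num_rows=360&range_days=360&endDate=11/06/2019'
ticker_list = ['COMP', 'SPX', 'HK/XHKG/HSI']

# collect one DataFrame per ticker, keyed by ticker symbol
frames = {}
for ticker in ticker_list:
    response = requests.get(base_url.format(ticker_name=ticker))
    frames[ticker] = pd.read_csv(io.StringIO(response.content.decode('utf-8')))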
I'm trying to get titles from an old website.
The problem is that in some cases I'm getting a null value.
Therefore, I have tried to add a while loop that changes the URL.
Is my while loop in the right place?
The procedure is like this:
open file
get url
check url
get title
print title
while title is null:
    replace part of the url and check url again
from urllib.request import urlopen
from bs4 import BeautifulSoup
from openpyxl import Workbook
import os
import xlrd
import lxml
# set file location
os.chdir("/excel_files")
# set the name of the file
file_name = "old.xlsx"
# open workbook
workbook = xlrd.open_workbook(file_name)
# set existing worksheet
sheet = workbook.sheet_by_index(0)
temp_list = [20131022212405,20090127003537,2009012702352,]
for i in range(sheet.nrows):
    try:
        u = sheet.cell_value(i,1)
        html = urlopen(u)
        bsObj = BeautifulSoup(html.read(), features='lxml')
        # get title
        title = str(bsObj.title)
        print('row no. ',i, 'title is :' , title)
    except:
        title = 'null'
        while (title == 'null'):
            try:
                u = u.replace(temp_list[i], temp_list[i + 1])
                html = urlopen(u)
                bsObj = BeautifulSoup(html.read(), features='lxml')
                title = str(bsObj.title)
            except:
                print('title is :',title)
I'm getting null all the time, instead of only for the rows that actually are null.
It looks like your try/except indentation in the first for loop (for i in range(sheet.nrows):) is wrong; try and except should be at the same level.
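For illustration, here is a minimal sketch of one way the loop could be arranged so that the URL-replacement fallback only runs for rows whose first fetch fails. It assumes the same workbook, sheet, and temp_list as in the question; the list values are cast to str because str.replace() needs string arguments, and the retry gives up after a failure rather than looping forever:
for i in range(sheet.nrows):
    title = 'null'
    try:
        u = sheet.cell_value(i, 1)
        html = urlopen(u)
        bsObj = BeautifulSoup(html.read(), features='lxml')
        title = str(bsObj.title)
    except Exception:
        title = 'null'
    # only rows whose first fetch failed reach this fallback
    while title == 'null':
        try:
            u = u.replace(str(temp_list[i]), str(temp_list[i + 1]))
            html = urlopen(u)
            bsObj = BeautifulSoup(html.read(), features='lxml')
            title = str(bsObj.title)
        except Exception:
            break  # give up on this row rather than retrying forever
    print('row no.', i, 'title is:', title)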
I'm trying to scrape a forum discussion and export it as a csv file, with rows such as "thread title", "user", and "post", where the latter is the actual forum post from each individual.
I'm a complete beginner with Python and BeautifulSoup so I'm having a really hard time with this!
My current problem is that all the text is split into one character per row in the csv file. Is there anyone out there who can help me out? It would be fantastic if someone could give me a hand!
Here's the code I've been using:
from bs4 import BeautifulSoup
import csv
import urllib2
f = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(f)
b = soup.get_text().encode("utf-8").strip() #the posts contain non-ascii words, so I had to do this
writer = csv.writer(open('silkroad.csv', 'w'))
writer.writerows(b)
Ok here we go. Not quite sure what I'm helping you do here, but hopefully you have a good reason to be analyzing silk road posts.
You have a few issues here; the big one is that you aren't parsing the data at all. What you're essentially doing with .get_text() is going to the page, highlighting the whole thing, and then copying and pasting it all into a CSV file.
So here is what you should be trying to do:
Read the page source
Use soup to break it into sections you want
Save sections in parallel arrays for author, date, time, post, etc
Write data to csv file row by row
I wrote some code to show you what that looks like; it should do the job:
from bs4 import BeautifulSoup
import csv
import urllib2
# get page source and create a BeautifulSoup object based on it
print "Reading page..."
page = urllib2.urlopen("https://silkroad5v7dywlc.onion.to/index.php?action=printpage;topic=28536.0")
soup = BeautifulSoup(page)
# if you look at the HTML all the titles, dates,
# and authors are stored inside of <dt ...> tags
metaData = soup.find_all("dt")
# likewise the post data is stored
# under <dd ...>
postData = soup.find_all("dd")
# define where we will store info
titles = []
authors = []
times = []
posts = []
# now we iterate through the metaData and parse it
# into titles, authors, and dates
print "Parsing data..."
for html in metaData:
    text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "") # convert the html to text
    titles.append(text.split("Title:")[1].split("Post by:")[0].strip()) # get Title:
    authors.append(text.split("Post by:")[1].split(" on ")[0].strip()) # get Post by:
    times.append(text.split(" on ")[1].strip()) # get date
# now we go through the actual post data and extract it
for post in postData:
    posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())
# now we write data to csv file
# ***csv files MUST be opened with the 'b' flag***
csvfile = open('silkroad.csv', 'wb')
writer = csv.writer(csvfile)
# create template
writer.writerow(["Time", "Author", "Title", "Post"])
# iterate through and write all the data
for time, author, title, post in zip(times, authors, titles, posts):
    writer.writerow([time, author, title, post])
# close file
csvfile.close()
# done
print "Operation completed successfully."
EDIT: Included solution that can read files from directory and use data from that
Okay, so you have your HTML files in a directory. You need to get a list of files in the directory, iterate through them, and append to your csv file for each file in the directory.
This is the basic logic of our new program.
If we had a function called processData() that took a file path as an argument and appended data from the file to your csv file here is what it would look like:
# the directory where we have all our HTML files
dir = "myDir"
# our csv file
csvFile = "silkroad.csv"
# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile) # get the file path
    processData(path) # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..." # display status
    count = count + 1 # increment counter
As it happens, our processData() function is more or less what we did before, with a few small changes:
We write the column headers first thing
Following that we open the csv with the 'ab' flag to append
We import os to get a list of files
Here's what that looks like:
from bs4 import BeautifulSoup
import csv
import urllib2
import os # added this import to process files/dirs
# ** define our data processing function
def processData( pageFile ):
    ''' take the data from an html file and append to our csv file '''
    f = open(pageFile, "r")
    page = f.read()
    f.close()
    soup = BeautifulSoup(page)
    # if you look at the HTML all the titles, dates,
    # and authors are stored inside of <dt ...> tags
    metaData = soup.find_all("dt")
    # likewise the post data is stored
    # under <dd ...>
    postData = soup.find_all("dd")
    # define where we will store info
    titles = []
    authors = []
    times = []
    posts = []
    # now we iterate through the metaData and parse it
    # into titles, authors, and dates
    for html in metaData:
        text = BeautifulSoup(str(html).strip()).get_text().encode("utf-8").replace("\n", "") # convert the html to text
        titles.append(text.split("Title:")[1].split("Post by:")[0].strip()) # get Title:
        authors.append(text.split("Post by:")[1].split(" on ")[0].strip()) # get Post by:
        times.append(text.split(" on ")[1].strip()) # get date
    # now we go through the actual post data and extract it
    for post in postData:
        posts.append(BeautifulSoup(str(post)).get_text().encode("utf-8").strip())
    # now we write data to csv file
    # ***csv files MUST be opened with the 'b' flag***
    csvfile = open('silkroad.csv', 'ab')
    writer = csv.writer(csvfile)
    # iterate through and write all the data
    for time, author, title, post in zip(times, authors, titles, posts):
        writer.writerow([time, author, title, post])
    # close file
    csvfile.close()
# ** start our process of going through files
# the directory where we have all our HTML files
dir = "myDir"
# our csv file
csvFile = "silkroad.csv"
# insert the column titles to csv
csvfile = open(csvFile, 'wb')
writer = csv.writer(csvfile)
writer.writerow(["Time", "Author", "Title", "Post"])
csvfile.close()
# get a list of files in the directory
fileList = os.listdir(dir)
# define variables we need for status text
totalLen = len(fileList)
count = 1
# iterate through files and read all of them into the csv file
for htmlFile in fileList:
    path = os.path.join(dir, htmlFile) # get the file path
    processData(path) # process the data in the file
    print "Processed '" + path + "'(" + str(count) + "/" + str(totalLen) + ")..." # display status
    count = count + 1 # increment counter
I am reading an Excel file using xlrd. In one column I have a company name which is formatted as a hyperlink (meaning there is a URL behind it). When I get the cell value I only get the company name. How can I also get the URL behind it?
Below is the code for reading an Excel file using the xlrd module (assume the necessary modules are imported).
mainData_book = xlrd.open_workbook("IEsummary.xls", formatting_info=True)
mainData_sheet = mainData_book.sheet_by_index(0) # Get the first sheet 0
start = 1
end = 101
for counter in range(start, end):
    rowValues = mainData_sheet.row_values(counter, start_colx=0, end_colx=8)
    company_name = rowValues[0] # how can I get the link here as well?
In xlrd 0.7.2 or newer, you can use hyperlink_map:
import xlrd
mainData_book = xlrd.open_workbook("IEsummary.xls", formatting_info=True)
mainData_sheet = mainData_book.sheet_by_index(0)
for row in range(1, 101):
    rowValues = mainData_sheet.row_values(row, start_colx=0, end_colx=8)
    company_name = rowValues[0]
    link = mainData_sheet.hyperlink_map.get((row, 0))
    url = '(No URL)' if link is None else link.url_or_path
    print(company_name.ljust(20) + ': ' + url)