Cannot overwrite CSV file in Python - python

I have already searched for an answer here and spent a long time on google, but nothing...
I've already tried opening the file with 'w' instead of 'r' or 'a' but I still can't get my code to overwrite the current results I have originally written to the CSV file. I'm basically scraping information from a website and I want to first search for a term, scrape that data, save it to the CSV file AND THEN, search for another term and scrape that data and overwrite the current CSV file with the new data.
#from pyvirtualdisplay import Display
import csv
from bs4 import BeautifulSoup
import urllib.request
def getPageSource(current_page):
    """Download *current_page* and return it parsed as a BeautifulSoup tree.

    The request is sent with browser-like headers. The page is parsed with
    the ``html5lib`` parser.

    Args:
        current_page: Full URL of the page to download.

    Returns:
        The parsed document as a ``BeautifulSoup`` object.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    hdr = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
    }
    req = urllib.request.Request(current_page, headers=hdr)
    # The response is a context manager; closing it releases the connection
    # as soon as the markup has been read by the parser.
    with urllib.request.urlopen(req) as page:
        # Return the tree: without this the caller receives None and has
        # nothing to scrape.
        return BeautifulSoup(page, "html5lib")
def get_length(file_path):
    """Return the number of CSV rows stored in *file_path*.

    Args:
        file_path: Path of the CSV file to count.

    Returns:
        The row count as an int (a header row, if present, is counted too),
        or 0 when the file does not exist yet.
    """
    try:
        # newline='' is what the csv module expects so that quoted fields
        # containing line breaks are counted as a single row.
        with open(file_path, 'r', encoding='utf8', newline='') as csvfile:
            # Count lazily instead of materialising every row in a list.
            return sum(1 for _ in csv.reader(csvfile))
    except FileNotFoundError:
        # Nothing has been written yet, so there are no rows.
        return 0
def write_data(file_path, company_name, role, full_url, date):
    """Append one job posting as a row of the CSV at *file_path*.

    A header row is written first when the file is missing or empty. The
    file is opened in append mode: opening it with "w" for every row would
    truncate it each time and leave only the last posting.

    Args:
        file_path: Path of the CSV file to append to.
        company_name: Value for the "Company" column.
        role: Value for the "Role" column.
        full_url: Value for the "URL" column.
        date: Value for the "Date" column.
    """
    import os  # local import: the surrounding script does not import os

    fieldnames = ['ID', 'Company','Role', 'URL', 'Date']
    is_new = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    # With a header present, the current row count (header + data rows)
    # equals the next 1-based ID.
    next_id = 1 if is_new else get_length(file_path)
    with open(file_path, "a", encoding='utf8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if is_new:
            writer.writeheader()
        writer.writerow({
            "ID": next_id,
            "Company": company_name,
            "Role": role,
            "URL": full_url,
            "Date": date
        })


def find_data(source):
    """Write every job posting found in *source* to ``data.csv``.

    Any previous content of ``data.csv`` is discarded first, so each search
    replaces the results of the one before it.

    Args:
        source: Parsed page exposing ``find_all`` (a BeautifulSoup tree).
    """
    base_url = ''
    out_path = "data.csv"
    # Truncate once per search; the individual rows are then appended.
    with open(out_path, "w", encoding='utf8'):
        pass
    for a in source.find_all(attrs={"itemtype" : ""}):
        job_info = a.find('h2').find('a')
        company_name = a.find('h3').find('a').get_text()
        url = job_info['href']
        full_url = (base_url + url)
        role = (job_info.get_text())
        date = a.find('li',class_='updated-time').get_text().replace('Updated','').strip()
        write_data(out_path, company_name, role, full_url, date)
# Script entry point: ask for a search term and download the matching results page.
if __name__ == '__main__':
query = input('Enter role to search: ')
# NOTE(review): the URL prefix is an empty string here, and `source` is not
# used after this line (find_data is never called in this snippet) — confirm
# whether lines were dropped from the original script.
source = getPageSource(''+query+'&Location=102&Category=3&Recruiter=All&SortBy=MostRecent&PerPage=100')

You need to keep the file open until you have finished writing it. Also, it is easier to keep a count of the rows written (using enumerate()) than to keep trying to read the file back in:
import csv
from bs4 import BeautifulSoup
import urllib.request
def getPageSource(current_page):
    """Download *current_page* and return it parsed as a BeautifulSoup tree.

    The request is sent with browser-like headers. The page is parsed with
    the ``html5lib`` parser.

    Args:
        current_page: Full URL of the page to download.

    Returns:
        The parsed document as a ``BeautifulSoup`` object.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    hdr = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
    }
    req = urllib.request.Request(current_page, headers=hdr)
    # Use the response as a context manager so the connection is released
    # once the parser has read the markup, instead of being left open.
    with urllib.request.urlopen(req) as page:
        return BeautifulSoup(page, "html5lib")
def find_data(source):
    """Write every job posting found in *source* to ``data.csv``.

    The file is opened once in "w" mode, so earlier content is replaced,
    and it stays open until all rows are written. Each row is
    ``[id, company, role, url, date]`` with ids counting up from 1; no
    header row is written.

    Args:
        source: Parsed page exposing ``find_all`` (a BeautifulSoup tree).
    """
    base_url = ''
    with open('data.csv', 'w', encoding='utf8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        postings = source.find_all(attrs={"itemtype" : ""})
        # enumerate() supplies the running row id, so the file never has to
        # be read back to work out the next one.
        for row_id, posting in enumerate(postings, start=1):
            job_link = posting.find('h2').find('a')
            company_name = posting.find('h3').find('a').get_text()
            full_url = base_url + job_link['href']
            role = job_link.get_text()
            date = posting.find('li', class_='updated-time').get_text().replace('Updated', '').strip()
            writer.writerow([row_id, company_name, role, full_url, date])
# Script entry point: ask for a search term, download the matching results
# page and save its postings to data.csv.
if __name__ == '__main__':
    from urllib.parse import quote_plus  # local import keeps the snippet self-contained

    query = input('Enter role to search: ')
    # Encode the term so spaces and other reserved characters typed by the
    # user cannot produce an invalid URL.
    source = getPageSource(''+quote_plus(query)+'&Location=102&Category=3&Recruiter=All&SortBy=MostRecent&PerPage=100')
    # Hand the parsed page to the scraper; without this call the page is
    # downloaded but nothing is written to the CSV file.
    find_data(source)
This would give you data.csv starting:
1,Computer Futures,Xamarin Developer,,06/03/2018
2,Wallace Myers International,New Business Development Manager,,06/03/2018
3,Reperio Human Capital Ltd,Senior Software Developer - Dublin,,20/03/2018
In your case, it is probably easier to just use a plain csv.writer() rather than a DictWriter().


ExpatError: not well-formed (invalid token): line 6, column 519

I am trying to parse the elements of a webpage and convert them to JSON format so that I can use the details of the players.
I am getting an error below :
ExpatError: not well-formed (invalid token): line 6, column 519
My code is below -
import xmltojson
import json
import requests
# Sample URL to fetch the html page
url = ""
# Headers to mimic the browser
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/ Safari/537.36',
'Connection': 'keep-alive'
# Get the page through get() method
html_response = requests.get(url=url, headers = headers)
# Save the page content as sample.html
with open("sample.html", "w", encoding="utf-8") as html_file:
with open("sample.html", "r", encoding = "utf-8") as html_file:
html =
json_ = xmltojson.parse(html)
with open("data.json", "w") as file:
json.dump(json_, file)

Scraping Data from booking with python

hope you're doing well !
I'm trying to scrape data from Booking (hotel name, room, ...). The code runs without errors, but no data ends up in the Excel (CSV) file; the data file is empty!
This is my code :
# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('C:/Users/pc/OneDrive/Bureau/booking-hotel-scraper-master/booking.yml')
def scrape(url):
headers = {
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
'DNT': '1',
'Upgrade-Insecure-Requests': '1',
# You may want to change the user agent if you get blocked
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Referer': '',
'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
# Download the page using requests
print("Downloading %s"%url)
r = requests.get(url, headers=headers)
# Pass the HTML of the page and create
return e.extract(r.text,base_url=url)
with open("C:/Users/pc/OneDrive/Bureau/booking-hotel-scraper-master/urls.txt",'r') as urllist, open('C:/Users/pc/OneDrive/Bureau/booking-hotel-scraper-master/data.csv','w') as outfile:
fieldnames = [
writer = csv.DictWriter(outfile, fieldnames=fieldnames,quoting=csv.QUOTE_ALL)
for url in urllist.readlines():
data = scrape(url)
if data and data['hotels'] is not None:
for h in data["hotels"]:
And this is the result in the excel file :
There is no error in my code; the only problem is how to get this data into the file.
The booking.yml :

Read URLs from external file

I found the following TikTok Downloader which is working fine.
from argparse import ArgumentParser
import os
from urllib.parse import parse_qsl, urlparse
import requests
class TikTokDownloader:
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
'DNT': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
'Accept': '*/*',
'Sec-Fetch-Site': 'same-site',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Dest': 'video',
'Referer': '',
'Accept-Language': 'en-US,en;q=0.9,bs;q=0.8,sr;q=0.7,hr;q=0.6',
'sec-gpc': '1',
'Range': 'bytes=0-',
def __init__(self, url: str, web_id: str):
self.__url = url
self.__cookies = {
'tt_webid': web_id,
'tt_webid_v2': web_id
def __get_video_url(self) -> str:
response = requests.get(self.__url, cookies=self.__cookies, headers=TikTokDownloader.HEADERS)
return response.text.split('"playAddr":"')[1].split('"')[0].replace(r'\u0026', '&')
def download(self, file_path: str):
video_url = self.__get_video_url()
url = urlparse(video_url)
params = tuple(parse_qsl(url.query))
request = requests.Request(method='GET',
url.netloc, url.path),
prepared_request = request.prepare()
session = requests.Session()
response = session.send(request=prepared_request)
if os.path.exists(file_path):
choice = input('File already exists. Overwrite? (Y/N): ')
if choice.lower() != 'y':
with open(os.path.abspath(file_path), 'wb') as output_file:
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument('--web-id', help='Value of tt_webid or tt_webid_v2 cookie (they are the same).')
parser.add_argument('-o', '--output', default='download.mp4', help='Full output path.')
parser.add_argument('url', help='Video url ( or')
args = parser.parse_args()
downloader = TikTokDownloader(args.url, args.web_id)
The issue is that I have to run this command to download each video:
python3 ./ --web-id 1234567890123 -o ./file.mp4
And I have 1000 links to download. All the links are in a txt file, not separated by commas. Like:
So- I'm looking to find a way to read the text file and automatically replace the link in the command that I have to run. Or should I change the actual script?
Use my code please, I have just defined a function that will help you to download all those videos by just entering the path where the file with a thousand links is located, preferably save this python script in the same directory where your file with a thousand links is located:
Use the function
This is going to put automatic names to each video based on date and time, I tested it and it works!
Here is the code by jbsidis:
from argparse import ArgumentParser
import os
from urllib.parse import parse_qsl, urlparse
import requests
class TikTokDownloaderjbsidis:
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
'DNT': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
'Accept': '*/*',
'Sec-Fetch-Site': 'same-site',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Dest': 'video',
'Referer': '',
'Accept-Language': 'en-US,en;q=0.9,bs;q=0.8,sr;q=0.7,hr;q=0.6',
'sec-gpc': '1',
'Range': 'bytes=0-',
def __init__(self, url, web_id):
self.__url = url
self.__cookies = {
'tt_webid': web_id,
'tt_webid_v2': web_id
def __get_video_url(self) -> str:
response = requests.get(self.__url, cookies=self.__cookies, headers=TikTokDownloaderjbsidis.HEADERS)
return response.text.split('"playAddr":"')[1].split('"')[0].replace(r'\u0026', '&')
def download(self, file_path: str):
video_url = self.__get_video_url()
url = urlparse(video_url)
params = tuple(parse_qsl(url.query))
request = requests.Request(method='GET',
url.netloc, url.path),
prepared_request = request.prepare()
session = requests.Session()
response = session.send(request=prepared_request)
if os.path.exists(file_path):
choice = str('jbsidis File already exists. Overwrite? (Y/N): ')
print("Downloading jbsidis == "+str(file_path))
with open(os.path.abspath(file_path), 'wb') as output_file:
import time
import random
def A_thousand_links_jbsidis(file_with_a_thousand_links):
m=n.splitlines() #guessing the links are per line
MyWebIDis="1234567890123" #put the id that works for you
for new_url in m:
new_auto_file_name=str(c)+" - "+str(time.strftime("_%Y%m%d_%H%M%S_"))+"_video_"+".mp4" #i guess they are mp4
clean_url=str(new_url).replace("\n","").replace("\x0a","").replace("\x0d","").replace(" ","")
downloader = TikTokDownloaderjbsidis(clean_url, MyWebIDis)
time.sleep(10) #just in case the internet is not that fast, wait 10 seconds after next download
And here is the image, I don't know why sometimes we answer questions without giving a real solution, greetings from El Salvador.

Formatting scraped data Python Beautifulsoup

I am trying to scrape from this URL design names, creator names, fabric types, prices as per fabric type
The good thing is they have public API endpoints which make the data extraction simple
But the problem is they have different URLs for design names and for pricing
i.e to collect names of design and the creator name I have to ping this URL
And for pricing per fabric type requesting this endpoint
I am getting the correct data, but I have stumbled across some formatting issues.
What I am looking for is something like this.
Each design with its fabric type alongside its prices in a single row. Instead, I am getting this kind of output
It would be great if anyone here can guide me through this like how to get the expected_output_result I am looking for.
Below is my code:
import requests
from bs4 import BeautifulSoup
import json
import csv
cookies = {
'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
'Content-Type': 'application/json',
'Origin': '',
'Connection': 'keep-alive',
'Referer': '',
'Sec-GPC': '1',
'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
'TE': 'Trailers',
res = requests.get('')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
item_endpoint = ''
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]
scraped_items = []
for item in item_response['page_results']:
for fab_type in fabric:
details_endpoint = '' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id='+ str(item['designId']) + '&page_locale=en'
details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
designName = item['name'],
screeName = item['user']['screenName']
fabric_name = details_endpoint_response['data']['fabric_code']
test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
test_swatch_meter = 'N/A'
fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
fat_quarter_meter = 'N/A'
meter = details_endpoint_response['data']['pricing']['METER']['price']
meter = 'N/A'
'designName': designName,
'screenName': screeName,
'fabric_name': fabric_name,
'test_swatch_meter': test_swatch_meter,
'fat_quarter_meter': fat_quarter_meter,
'meter': meter
print(designName, screeName, fabric_name, test_swatch_meter,fat_quarter_meter, meter)
print(json.dumps(scraped_items, indent=2))
with open('scraped_data.csv', 'w', newline='') as csv_file:
writer = csv.DictWriter(csv_file, fieldnames=scraped_items[0].keys())
for row in scraped_items:
One way to do it is to reconfigure how you construct the output. Instead of a list, use a dictionary keyed by (designName, screenName), with the fabric values for that design stored under each key. One thing to keep in mind is that dictionaries don't allow duplicate keys, so I had to number the column names; however, you can remove those numbers later if you'd like.
See if this gets what you are wanting:
import requests
from bs4 import BeautifulSoup
import json
import csv
import pandas as pd
from collections import OrderedDict
cookies = {
'b': '1.2qu49mazdxsj0.40fc8b88.quqq3d.9q7z',
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'X-Spoonflower-Window-UUID': 'a9bc37a2-9eb2-4a1e-8ea1-fcee89347364',
'Content-Type': 'application/json',
'Origin': '',
'Connection': 'keep-alive',
'Referer': '',
'Sec-GPC': '1',
'If-None-Match': 'W/95d6572c326b81ce98c7ae27ac449d42',
'TE': 'Trailers',
res = requests.get('')
soup = BeautifulSoup(res.text, 'lxml')
fabrics = [fabric.find('h2').text.strip() for fabric in soup.find_all('div', {'class': 'product_detail medium_text'})]
fabric = []
for fab in fabrics:
item_endpoint = ''
item_response = requests.get(item_endpoint).json()
#item_data = items_json['page_results'][0]
items_dict = OrderedDict()
for item in item_response['page_results']:
for fab_type in fabric:
details_endpoint = '' + fab_type + '?quantity=1&shipping_country=PK&currency=EUR&measurement_system=METRIC&design_id='+ str(item['designId']) + '&page_locale=en'
details_endpoint_response = requests.get(details_endpoint, headers=headers, cookies=cookies).json()
designName = item['name']
screenName = item['user']['screenName']
fabric_name = details_endpoint_response['data']['fabric_code']
test_swatch_meter = details_endpoint_response['data']['pricing']['TEST_SWATCH_METER']['price']
test_swatch_meter = 'N/A'
fat_quarter_meter = details_endpoint_response['data']['pricing']['FAT_QUARTER_METER']['price']
fat_quarter_meter = 'N/A'
meter = details_endpoint_response['data']['pricing']['METER']['price']
meter = 'N/A'
if (designName, screenName) not in items_dict.keys():
items_dict[(designName, screenName)] = {}
itemCount = len(items_dict[(designName, screenName)].values()) / 4
items_dict[(designName, screenName)].update({'fabric_name_%02d' %itemCount: fabric_name,
'test_swatch_meter_%02d' %itemCount: test_swatch_meter,
'fat_quarter_meter_%02d' %itemCount: fat_quarter_meter,
'meter_%02d' %itemCount: meter})
print(designName, screenName, fabric_name, test_swatch_meter,fat_quarter_meter, meter)
df = pd.DataFrame.from_dict(items_dict, orient='index').reset_index(drop=False)
df = df.rename(columns={'level_0':'designName','level_1':'screenName'})
df.to_csv('scraped_data.csv', index=False)

Populate a csv file with scraped data

I'm having trouble with this and I know it is basic as I was blocked for 2 days for asking a similar question on Friday, but I'm really struggling so would appreciate help.
How do I edit the code below to populate the csv I have created with the table I have pulled from the airport site i.e. the flight arrival data?
import requests
import csv, sys
from bs4 import BeautifulSoup
cookies = {
'ApplicationGatewayAffinity': '1d2ad8ab214d1293a4e31bcd161589fa82a54a39bb7b3be80b503996092d4296',
'ApplicationGatewayAffinityCORS': '1d2ad8ab214d1293a4e31bcd161589fa82a54a39bb7b3be80b503996092d4296',
headers = {
'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Referer': '',
'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8,fr;q=0.7,nl;q=0.6',
response = requests.get('', headers=headers, cookies=cookies)
soup = BeautifulSoup(response.content, 'html.parser')
writer = csv.writer(sys.stdout)
'Arriving From'
'Scheduled to arrive', 'Latest Update', 'Status'
with open('flight.csv','w') as f:
[table] = soup.find_all("table")
for row in table.find_all("tr"):
[td.string.strip() for td in row.find_all("td")]
writer = csv.writer(f)
One minor error, you need to have the writer = csv.writer(f) in the with block
# Dump the cells of the page's single table into flight.csv, one CSV row per
# table row. newline='' stops the csv module's own line endings from being
# translated again, which would put a blank line between rows on Windows.
with open('flight.csv', 'w', newline='') as f:
    # Exactly one <table> is expected; this unpacking raises ValueError
    # when the page has none or several.
    [table] = soup.find_all("table")
    writer = csv.writer(f)
    for row in table.find_all("tr"):
        # get_text() also handles cells with nested tags, where td.string
        # is None and .strip() would fail.
        cells = [td.get_text(strip=True) for td in row.find_all("td")]
        # Rows without <td> cells (e.g. a header row built from <th>) have
        # nothing to record.
        if cells:
            # Actually write the row; building the list alone discards it.
            writer.writerow(cells)

