Note: Beginner here. I'm trying to create an HTML parser that goes through a folder and all of its subfolders, strips the HTML tags out of every HTML file it finds, and exports the remaining text to both CSV and TXT, with each output file named after the title of the web page.
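Here is roughly the flow I had in mind when I started. This is just a simplified, untested sketch with placeholder paths, using os.walk and BeautifulSoup's get_text rather than the tag-by-tag approach my real script (below) takes:

import csv
import os
import bs4 as bs

source_dir = "/Users/username/site"       # folder to walk (placeholder)
output_dir = "/Users/username/site/done"  # where the .txt/.csv copies go (placeholder)

# walk source_dir and every subfolder
for root, dirs, files in os.walk(source_dir):
    for name in files:
        if not name.lower().endswith((".html", ".htm")):
            continue
        path = os.path.join(root, name)
        with open(path, "r", encoding="utf-8") as f:
            soup = bs.BeautifulSoup(f.read(), "html.parser")
        # use the page title as the output file name, falling back to the file name
        # (a title containing "/" would need sanitizing before use in a path)
        title = soup.title.string if soup.title and soup.title.string else name
        text = soup.get_text(separator=" ", strip=True)   # all text, no tags
        with open(os.path.join(output_dir, title + ".txt"), "w", encoding="utf-8") as out:
            out.write(text)
        with open(os.path.join(output_dir, title + ".csv"), "w", newline="", encoding="utf-8") as out:
            csv.writer(out).writerow([title, text])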
In my actual script, however, I keep getting an error:

Traceback (most recent call last):
  File "/Users/username/Documents/htmlparser/parser10.py", line 59, in <module>
    for subentry in os.scandir(entry.path):
NotADirectoryError: [Errno 20] Not a directory: '/Users/username/site/.DS_Store'

Here's what I've done so far (line 59, where the error occurs, is marked with a comment):

import bs4 as bs
import csv
import glob
import os
import re

directory = "/Users/username/site"

with os.scandir(directory) as it:
    for entry in it:
        if ".html" in entry.name or re.match(r'.*?(?:\.html?$|\.html?\?.*)', entry.name) is not None:
            print(entry.name, entry.path)
            my_data = (entry)
            listofp = []
            soup = bs.BeautifulSoup(open(my_data, "r").read())
            for paragraph in soup.find_all('p'):
                listofp.append(paragraph.string)
            title = soup.title.string
            leftitle = [title]
            listception = [leftitle]
            for moreshit in soup.find_all('h1', 'h2', 'h3', 'h4', 'h5'):
                listception.append([str(moreshit.text)])
            for paragraph in soup.find_all('p'):
                listception.append([str(paragraph.text)])
            for elements in soup.find_all('li', 'td', 'div', 'span'):
                listception.append([str(elements.text)])
            for evenmoreshit in soup.find_all('h6', 'a'):
                listception.append([str(evenmoreshit.text)])
            num = 0
            with open('export/' + title + '.csv', 'w') as csv_file:
                writer = csv.writer(csv_file, delimiter=',')
                writer.writerows(listception)
            file_path = os.path.join(directory, entry)
            text = open(file_path, mode='r').read()
            results = str(listception).strip('[]')
            results = results.replace("[", " ")
            results = results.replace("]", " ")
            results = results.replace("""\n""", " ")
            results_dir = "/Users/username/site/done"
            results_file = title + '.txt'
            file_path = os.path.join(results_dir, results_file)
            open(file_path, mode='w', encoding='UTF-8').write(results)
            continue
        for subentry in os.scandir(entry.path):    # <-- line 59
            for file in os.scandir(subentry.path):
                if ".html" in entry.name or re.match(r'.*?(?:\.html?$|\.html?\?.*)', entry.name) is not None:
                    print(entry.name, entry.path)
                    my_data = (entry)
                    listofp = []
                    soup = bs.BeautifulSoup(open(my_data, "r").read())
                    for paragraph in soup.find_all('p'):
                        listofp.append(paragraph.string)
                    title = soup.title.string
                    leftitle = [title]
                    listception = [leftitle]
                    for moreshit in soup.find_all('h1', 'h2', 'h3', 'h4', 'h5'):
                        listception.append([str(moreshit.text)])
                    for paragraph in soup.find_all('p'):
                        listception.append([str(paragraph.text)])
                    for elements in soup.find_all('li', 'td', 'div', 'span'):
                        listception.append([str(elements.text)])
                    for evenmoreshit in soup.find_all('h6', 'a'):
                        listception.append([str(evenmoreshit.text)])
                    num = 0
                    with open('export/' + title + '.csv', 'w') as csv_file:
                        writer = csv.writer(csv_file, delimiter=',')
                        writer.writerows(listception)
                    file_path = os.path.join(directory, entry)
                    text = open(file_path, mode='r').read()
                    results = str(listception).strip('[]')
                    results = results.replace("[", " ")
                    results = results.replace("]", " ")
                    results = results.replace("""\n""", " ")
                    results_dir = "/Users/username/site/done"
                    results_file = title + '.txt'
                    file_path = os.path.join(results_dir, results_file)
                    open(file_path, mode='w', encoding='UTF-8').write(results)
                    continue

Would love any help whatsoever or any suggestions of any kind. Thank you very much!
-- 
https://mail.python.org/mailman/listinfo/python-list