bin/ooxml-analyze.py | 99 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 35 deletions(-)
New commits: commit f2bde987693fad6e1347f99e34c2ad5291ea8ee6 Author: Gülşah Köse <gulsah.k...@collabora.com> AuthorDate: Thu Jun 3 14:11:05 2021 +0300 Commit: Gülşah Köse <gulsah.k...@collabora.com> CommitDate: Thu Jun 3 14:15:14 2021 +0300 Concatenate separate text runs and create a separate result file for them. For example: <a:p> <a:r> <a:t>text1</a:t> </a:r> <a:r> <a:t>text2</a:t> </a:r> </a:p> We will keep the resulting text as "text1text2". As a result, we will create <file name>.text to hold such concatenated texts. Change-Id: I946af39e2037db1f986e73039d0a462a36bba1d8 diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index 9db39d8c47da..a7e2bc2a549f 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -1,8 +1,9 @@ #!/usr/bin/python -import sys, getopt, os, shutil, pprint +import sys, getopt, os, shutil import xml.etree.ElementTree as ET from zipfile import ZipFile +from lxml import etree def main(argv): inputdir = '' @@ -28,9 +29,6 @@ def main(argv): elif opt in ("-o", "--odir"): outputdir = arg - # holds the result structer of analyze - result_list = [] - if(extracted_files_dir_by_user == ''): # use default directory path for extracted ooxml files. 
extracted_files_dir = os.path.join(outputdir, 'extractedfiles') @@ -40,22 +38,39 @@ def main(argv): extracted_files_dir = extracted_files_dir_by_user # create seperate result files for each ooxml document as <document name>.result in output directory + # create seperate concanated texts for each ooxml document as <document name>.text in output directory for ext_dir in get_list_of_subdir(extracted_files_dir): i = ext_dir.rfind('/') sub_result_name = ext_dir[i+1:] + ".result" + sub_texts_name = ext_dir[i+1:] + ".text" sub_result_list = [] - count_elements(ext_dir, sub_result_list) + concatenated_texts_list = [] # holds concanated texts for each paragraph + count_elements(ext_dir, sub_result_list, concatenated_texts_list) + sub_result_path = os.path.join(outputdir, sub_result_name) + sub_texts_path = os.path.join(outputdir, sub_texts_name) # sort the result sub list according to tag names sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False) + concatenated_texts_list.sort() if os.path.exists(sub_result_path): os.remove(sub_result_path) + if os.path.exists(sub_texts_path): + os.remove(sub_texts_path) + for i in sub_result_list: with open(sub_result_path, "a") as log_file: print(i, file=log_file) - log_file.close() + log_file.close() + for i in concatenated_texts_list: + with open(sub_texts_path, "a") as log_file: + print(i, file=log_file) + log_file.close() + + # no need to keep extracted files anymore. + if(os.path.exists(extracted_files_dir)): + shutil.rmtree(extracted_files_dir) # unzip all ooxml files into the given path def extract_files(inputdir, extracted_files_dir): @@ -98,6 +113,7 @@ def replace_namespace_with_alias(filename, element): element = element.replace("{" + element_ns + "}", "") return element +# decides which files shouldn't be analyzed. 
def is_file_in_accepted_files(filename): if(filename.endswith("[Content_Types].xml") or \ filename.endswith("docProps/custom.xml") or \ @@ -109,6 +125,7 @@ def is_file_in_accepted_files(filename): "ppt/slideLayouts" in filename or \ "ppt/slideMasters" in filename or \ "ppt/theme" in filename or \ + "ppt/notesMasters" in filename or \ filename.endswith("docProps/core.xml") or not \ filename.endswith(".xml")): return False @@ -116,7 +133,7 @@ def is_file_in_accepted_files(filename): return True # counts tags, attribute names and values of xmls -def count_elements(extracted_files_dir, result_list): +def count_elements(extracted_files_dir, result_list, concanated_texts_list): # make sure if extracted files directory exist if not (os.path.exists(extracted_files_dir)): @@ -131,40 +148,52 @@ def count_elements(extracted_files_dir, result_list): continue print(xmlfile) - tree = ET.parse(xmlfile) - root = tree.getroot() # start to count - for child in root.iter(): + for event, child in etree.iterparse(xmlfile, events=('start', 'end')): tag = replace_namespace_with_alias(xmlfile, child.tag) tag_idx = get_index_of_tag(tag, result_list) - # count tags - if (tag_idx == -1): - tmp_list = [{tag: 1},{},{},{}] - result_list.append(tmp_list) - else: - result_list[tag_idx][0][tag] += 1 - - # count attribute names and values of current tag - for attr_name, attr_value in child.attrib.items(): - attr_name = replace_namespace_with_alias(xmlfile, attr_name) - if not attr_name in result_list[tag_idx][1].keys(): - result_list[tag_idx][1][attr_name] = 1 - else: - result_list[tag_idx][1][attr_name] +=1 - - if not attr_value in result_list[tag_idx][2].keys(): - result_list[tag_idx][2][attr_value] = 1 - else: - result_list[tag_idx][2][attr_value] +=1 - - # count text contents except consisted of whitespaces. 
- if not (str(child.text) == "None" or str(child.text).strip()==""): - if not child.text in result_list[tag_idx][3].keys(): - result_list[tag_idx][3][child.text] = 1 + if event == "start": + # count tags + if (tag_idx == -1): + tmp_list = [{tag: 1},{},{},{}] + result_list.append(tmp_list) else: - result_list[tag_idx][3][child.text] += 1 + result_list[tag_idx][0][tag] += 1 + + # count attribute names and values of current tag + for attr_name, attr_value in child.attrib.items(): + attr_name = replace_namespace_with_alias(xmlfile, attr_name) + if not attr_name in result_list[tag_idx][1].keys(): + result_list[tag_idx][1][attr_name] = 1 + else: + result_list[tag_idx][1][attr_name] +=1 + + if not attr_value in result_list[tag_idx][2].keys(): + result_list[tag_idx][2][attr_value] = 1 + else: + result_list[tag_idx][2][attr_value] +=1 + + # concanated text will be resetted in every paragraph begining + if tag == "a:p": + concatenated_text = "" + + + if event == "end": + # Detect seperate texts in paragraph and concanate them. + if tag == "a:t": + concatenated_text += str(child.text) + # End of the paragraph element, add the text as list item. + if tag == "a:p" and concatenated_text != "": + concanated_texts_list.append(concatenated_text) + + # count text contents except consisted of whitespaces. + if not (str(child.text) == "None" or str(child.text).strip()==""): + if not child.text in result_list[tag_idx][3].keys(): + result_list[tag_idx][3][child.text] = 1 + else: + result_list[tag_idx][3][child.text] += 1 # gets the position of "tag" element in result list. If element is not exist, # return -1 that points the last index of the list. _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits