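The `->` in a function signature is not a lambda arrow: it is a function annotation (a type hint) that states the intended return type, just as `file_path : str` annotates the parameter. Python does not enforce these hints at runtime; they are there for type checkers, IDEs and readers. Have a look at: https://docs.python.org/3/library/typing.html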
Il giorno venerdì 5 giugno 2020 18:35:10 UTC+2, Agnese Camellini ha scritto: > Hello to everyone, lately I have been building up an open source project, with some > collaborators, but one of them cannot contribute any more. He is a solution > architect so he is very skilled (much more than me!). I am now analysing > his code to finish the job but I don't get this use of the lambda arrow, > it's like he is declaring the return type in the function signature (as > you would do in Java). I have never seen something like this in Python. > > Can someone please explain to me this usage (the part regarding the > question is highlighted in yellow): > > @classmethod > def extract_document_data(cls, file_path : str) -> DocumentData: > """ > Entry point of the module, it extracts the data from the document > whose path is passed as input. > The extraction strategy is automatically chosen based on the MIME > type > of the file. > > @type file_path: str > @param file_path: The path of the document to be parsed. > @rtype: DocumentData > @returns: An object containing the data of the parsed document. > """ > > mime = magic.Magic(mime=True) > mime_type = mime.from_file(file_path) > document_type = DocumentType.get_instance(mime_type) > strategy = cls.strategies[document_type] > return strategy.extract_document_data(file_path) > > > To be more verbose, this is the whole script: > > from enum import Enum > import json > import magic > > import docx > from pdfminer.converter import PDFPageAggregator > from pdfminer.layout import LAParams, LTContainer, LTTextContainer > from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines > from pdfminer.pdfinterp import PDFPageInterpreter > from pdfminer.pdfinterp import PDFResourceManager > from pdfminer.pdfpage import PDFPage > from pdfminer.pdfparser import PDFParser > > > class DocumentType(Enum): > """ > Defines the handled document types. > Each value is associated to a MIME type. 
> """ > > def __init__(self, mime_type): > self.mime_type = mime_type > > @classmethod > def get_instance(cls, mime_type : str): > values = [e for e in cls] > for value in values: > if value.mime_type == mime_type: > return value > raise MimeNotValidError(mime_type) > > PDF = 'application/pdf' > DOCX = > 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' > > > class MimeNotValidError(Exception): > """ > Exception to be raised when a not valid MIME type is processed. > """ > > pass > > > class DocumentData: > """ > Wrapper for the extracted document data (TOC and contents). > """ > > def __init__(self, toc : list = [], pages : list = [], document_text : > str = None): > self.toc = toc > self.pages = pages > if document_text is not None: > self.document_text = document_text > else: > self.document_text = ' '.join([page.replace('\n', ' ') for page > in pages]) > > def toc_as_json(self) -> str: > return json.dumps(self.toc) > > > class ExtractionStrategy: > """ > Base class for the extraction strategies. > """ > > @staticmethod > def extract_document_data(file_path : str) -> DocumentData: > pass > > > class DOCXExtractionStrategy(ExtractionStrategy): > """ > It implements the TOC and contents extraction from a DOCX document. > """ > > @staticmethod > def extract_document_data(file_path : str) -> DocumentData: > document = docx.Document(file_path) > body_elements = document._body._body > # Selecting only the <w:t> elements from DOCX XML, > # as they're the only to contain some text. > text_elems = body_elements.xpath('.//w:t') > return DocumentData(document_text = ' '.join([elem.text for elem in > text_elems])) > > > class PDFExtractionStrategy(ExtractionStrategy): > """ > It implements the TOC and contents extraction from a PDF document. 
> """ > > @staticmethod > def parse_toc(doc : PDFDocument) -> list: > raw_toc = [] > try: > outlines = doc.get_outlines() > for (level, title, dest, a, se) in outlines: > raw_toc.append((level, title)) > except PDFNoOutlines: > pass > return PDFExtractionStrategy.build_toc_tree(raw_toc) > > @staticmethod > def build_toc_tree(items : list) -> list: > """ > Builds the TOC tree from a list of TOC items. > > @type items: list > @param items: The TOC items. > Each item must have the following format: (<item depth>, <item > description>). > E.g: [(1, 'Contents'), (2, 'Chapter 1'), (2, 'Chapter 2')] > @rtype: list > @returns: The TOC tree. The tree hasn't a root element, therefore it > actually is a list. > """ > > toc = [] > if items is None or len(items) == 0: > return toc > current_toc_level = toc > # Using an explicit stack containing the lists corresponding to > # the various levels of the TOC, to simulate the recursive building > # of the TOC tree in a more efficient way > toc_levels_stack = [] > toc_levels_stack.append(current_toc_level) > > # Each TOC item can be inserted into the current TOC level as > # string (just the item description) or as dict, where the key is > # the item description and the value is a list containing the > # children TOC items. > # To correctly determine how to insert the current item into > # the current level, a kind of look-ahead is needed, that is > # the depth of the next item has to be considered. > > # Initializing the variables related to the previous item. > prev_item_depth, prev_item_desc = items[0] > # Adding a fake final item in order to handle all the TOC items > # inside the cycle. > items.append((-1, '')) > > for i in range(1, len(items)): > # In fact each iteration handles the item of the previous > # one, using the current item to determine how to insert > # the previous item into the current TOC level, > # as explained before. 
> curr_item = items[i] > curr_item_depth = curr_item[0] > > if curr_item_depth == prev_item_depth: > # The depth of the current item is the same > # as the previous one. > # Inserting the previous item into the current TOC level > # as string. > current_toc_level.append(prev_item_desc) > elif curr_item_depth == prev_item_depth + 1: > # The depth of the current item is increased by 1 compared > to > # the previous one. > # Inserting the previous item into the current TOC level > # as dict. > prev_item_dict = { prev_item_desc : [] } > current_toc_level.append(prev_item_dict) > # Updating the current TOC level with the newly created one > # which contains the children of the previous item. > current_toc_level = prev_item_dict[prev_item_desc] > toc_levels_stack.append(current_toc_level) > elif curr_item_depth < prev_item_depth: > # The depth of the current item is lesser than > # the previous one. > # Inserting the previous item into the current TOC level > # as string. > current_toc_level.append(prev_item_desc) > if i < len(items)-1: > # Executing these steps for all the items except the > last one > depth_diff = prev_item_depth - curr_item_depth > # Removing from the stack as many TOC levels as the > difference > # between the depth of the previous item and the depth > of the > # current one. > for i in range(0, depth_diff): > toc_levels_stack.pop() > # Updating the current TOC level with the one contained > in > # the head of the stack. > current_toc_level = toc_levels_stack[-1] > # Updating the previous item with the current one > prev_item_depth, prev_item_desc = curr_item > > return toc > > @staticmethod > def from_bytestring(s) -> str: > """ > If the input string is a byte-string, converts it to a string using > UTF-8 as encoding. > > @param s: A string or a byte-string. > @rtype: str > @returns: The potentially converted string. 
> """ > > if s: > if isinstance(s, str): > return s > else: > return s.encode('utf-8') > > @staticmethod > def parse_layout_nodes(container : LTContainer) -> str: > """ > Recursively extracts the text from all the nodes contained in the > input PDF layout tree/sub-tree. > > @type container: LTContainer > @param container: The PDF layout tree/sub-tree from which to > extract the text. > @rtype: str > @returns: A string containing the extracted text. > """ > > text_content = [] > > # The iterator returns the children nodes. > for node in container: > if isinstance(node, LTTextContainer): > # Only nodes of type LTTextContainer contain text. > > text_content.append(PDFExtractionStrategy.from_bytestring(node.get_text())) > elif isinstance(node, LTContainer): > # Recursively calling the method on the current node, which > is a container itself. > > text_content.append(PDFExtractionStrategy.parse_layout_nodes(node)) > else: > # Ignoring all the other node types. > pass > > # Joining all the extracted text chunks with a new line character. 
> return "\n".join(text_content) > > @staticmethod > def parse_pages(doc : PDFDocument) -> list: > rsrcmgr = PDFResourceManager() > laparams = LAParams() > device = PDFPageAggregator(rsrcmgr, laparams=laparams) > interpreter = PDFPageInterpreter(rsrcmgr, device) > > text_content = [] > for i, page in enumerate(PDFPage.create_pages(doc)): > interpreter.process_page(page) > layout = device.get_result() > # Extracts the text from all the nodes of the PDF layout tree > of each page > > text_content.append(PDFExtractionStrategy.parse_layout_nodes(layout)) > > return text_content > > @staticmethod > def parse_pdf(file_path : str) -> (list, list): > toc = [] > pages = [] > try: > fp = open(file_path, 'rb') > parser = PDFParser(fp) > doc = PDFDocument(parser) > parser.set_document(doc) > > if doc.is_extractable: > toc = PDFExtractionStrategy.parse_toc(doc) > pages = PDFExtractionStrategy.parse_pages(doc) > > fp.close() > except IOError: > pass > return (toc, pages) > > @staticmethod > def extract_document_data(file_path : str) -> DocumentData: > toc, pages = PDFExtractionStrategy.parse_pdf(file_path) > return DocumentData(toc, pages = pages) > > > class DocumentDataExtractor: > """ > Main class of the module. > It's responsible for actually executing the text extraction. > The output is constituted by the following items: > -table of contents (TOC); > -pages contents. > """ > > # Dictionary containing the extraction strategies for the different > # document types, indexed by the corresponding DocumentType enum values. > strategies = { > DocumentType.DOCX : DOCXExtractionStrategy(), > DocumentType.PDF : PDFExtractionStrategy() > } > > @classmethod > def extract_document_data(cls, file_path : str) -> DocumentData: > """ > Entry point of the module, it extracts the data from the document > whose path is passed as input. > The extraction strategy is automatically chosen based on the MIME > type > of the file. 
> > @type file_path: str > @param file_path: The path of the document to be parsed. > @rtype: DocumentData > @returns: An object containing the data of the parsed document. > """ > > mime = magic.Magic(mime=True) > mime_type = mime.from_file(file_path) > document_type = DocumentType.get_instance(mime_type) > strategy = cls.strategies[document_type] > return strategy.extract_document_data(file_path) -- https://mail.python.org/mailman/listinfo/python-list