tempgen.parsers.docx

View Source
import docx
from tempgen.parsers.parser import AbstractParser

class Parser(AbstractParser):
    '''
    Parser for .docx files, implemented on top of the `docx` (python-docx)
    module: documents are opened with `docx.Document`, and both body
    paragraphs and paragraphs inside table cells are processed.
    '''

    def paragraph_replace_text(self, paragraph, str, replace_str):
        '''
        Replace every occurrence of `str` in `paragraph` with `replace_str`
        by editing the text of the individual runs in place.

        Adapted from
        https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

        Parameters:
            paragraph: object exposing `text` and `runs`, where every run has
                a writable `text` attribute (presumably a python-docx
                Paragraph - confirm against callers).
            str: substring to look for. The name shadows the builtin but is
                kept so that existing keyword callers keep working.
            replace_str: text inserted in place of each occurrence.

        Returns:
            The same `paragraph` object that was passed in.
        '''
        if not str:
            # An empty needle matches at every offset, so the loop below
            # would never terminate.
            return paragraph
        needle_len = len(str)
        search_pos = 0
        while True:
            # paragraph.text is re-read on every pass, so the offset found
            # here already accounts for earlier replacements; no extra
            # shifting is required.
            start = paragraph.text.find(str, search_pos)
            if start == -1:
                break
            end = start + needle_len
            # Resume right after the text about to be inserted, so the
            # replacement itself is never re-scanned and nothing is skipped.
            search_pos = start + len(replace_str)
            runs = iter(paragraph.runs)
            # Skip the leading runs that lie entirely before the match,
            # converting start/end into offsets relative to the current run.
            for run in runs:
                run_len = len(run.text)
                if start < run_len:
                    break
                start, end = start - run_len, end - run_len
            else:
                # The offset is not inside any run (no runs at all, or
                # paragraph.text contains text that is not held in runs);
                # stop instead of editing the wrong run.
                break
            # The match begins in this run: put the whole replacement here.
            run_text = run.text
            run_len = len(run_text)
            run.text = '%s%s%s' % (run_text[:start], replace_str, run_text[end:])
            end -= run_len  # run length before the replacement
            # Strip what is left of the match from the following runs; the
            # same iterator is reused, so this continues with the next run.
            for run in runs:
                if end <= 0:
                    break
                run_text = run.text
                run_len = len(run_text)
                run.text = run_text[end:]
                end -= run_len
        return paragraph

    def replace_in_paragraph(self, p, d):
        '''
        Apply every `replaced -> replacement` pair of the dictionary `d` to
        paragraph `p`, in the iteration order of `d`.
        '''
        for replaced, replacement in d.items():
            self.paragraph_replace_text(p, replaced, replacement)

    def collect_paragraphs(self, doc):
        '''
        Return a list with the paragraphs of `doc.paragraphs` followed by the
        paragraphs of every cell of every column of every table in
        `doc.tables`.
        '''
        paragraphs = list(doc.paragraphs)
        for table in doc.tables:
            for col in table.columns:
                for cell in col.cells:
                    paragraphs.extend(cell.paragraphs)
        return paragraphs

    def parse(self, path, container, parse_entry, find_matches):
        '''
        Open the document at `path`, call `find_matches` on the text of each
        collected paragraph, and store `parse_entry(match, path)` for every
        match in `container` under the payload's 'id' key.
        '''
        doc = docx.Document(path)
        for p in self.collect_paragraphs(doc):
            for match in find_matches(p.text):
                payload = parse_entry(match, path)
                container[payload['id']] = payload

    def replace(self, source_path, target_path, compute_match, replacements, update_external = False):
        '''
        Open the document at `source_path`, substitute matches in every
        collected paragraph, and save the result to `target_path`.

        For each paragraph, `compute_match` is called with the paragraph
        text, a dictionary shared across all paragraphs, `replacements`,
        `source_path` and `update_external`; the shared dictionary is then
        applied to the paragraph through `replace_in_paragraph`.
        '''
        doc = docx.Document(source_path)
        paragraphs = self.collect_paragraphs(doc)
        # One dictionary is reused for all paragraphs, so entries added for
        # earlier paragraphs are applied to later ones as well.
        to_replace = {}
        for p in paragraphs:
            compute_match(p.text, to_replace, replacements, source_path, update_external)
            self.replace_in_paragraph(p, to_replace)
        doc.save(target_path)
View Source
class Parser(AbstractParser):
    '''
    Parser for .docx files, implemented on top of the `docx` (python-docx)
    module: documents are opened with `docx.Document`, and both body
    paragraphs and paragraphs inside table cells are processed.
    '''

    def paragraph_replace_text(self, paragraph, str, replace_str):
        '''
        Replace every occurrence of `str` in `paragraph` with `replace_str`
        by editing the text of the individual runs in place.

        Adapted from
        https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

        Parameters:
            paragraph: object exposing `text` and `runs`, where every run has
                a writable `text` attribute (presumably a python-docx
                Paragraph - confirm against callers).
            str: substring to look for. The name shadows the builtin but is
                kept so that existing keyword callers keep working.
            replace_str: text inserted in place of each occurrence.

        Returns:
            The same `paragraph` object that was passed in.
        '''
        if not str:
            # An empty needle matches at every offset, so the loop below
            # would never terminate.
            return paragraph
        needle_len = len(str)
        search_pos = 0
        while True:
            # paragraph.text is re-read on every pass, so the offset found
            # here already accounts for earlier replacements; no extra
            # shifting is required.
            start = paragraph.text.find(str, search_pos)
            if start == -1:
                break
            end = start + needle_len
            # Resume right after the text about to be inserted, so the
            # replacement itself is never re-scanned and nothing is skipped.
            search_pos = start + len(replace_str)
            runs = iter(paragraph.runs)
            # Skip the leading runs that lie entirely before the match,
            # converting start/end into offsets relative to the current run.
            for run in runs:
                run_len = len(run.text)
                if start < run_len:
                    break
                start, end = start - run_len, end - run_len
            else:
                # The offset is not inside any run (no runs at all, or
                # paragraph.text contains text that is not held in runs);
                # stop instead of editing the wrong run.
                break
            # The match begins in this run: put the whole replacement here.
            run_text = run.text
            run_len = len(run_text)
            run.text = '%s%s%s' % (run_text[:start], replace_str, run_text[end:])
            end -= run_len  # run length before the replacement
            # Strip what is left of the match from the following runs; the
            # same iterator is reused, so this continues with the next run.
            for run in runs:
                if end <= 0:
                    break
                run_text = run.text
                run_len = len(run_text)
                run.text = run_text[end:]
                end -= run_len
        return paragraph

    def replace_in_paragraph(self, p, d):
        '''
        Apply every `replaced -> replacement` pair of the dictionary `d` to
        paragraph `p`, in the iteration order of `d`.
        '''
        for replaced, replacement in d.items():
            self.paragraph_replace_text(p, replaced, replacement)

    def collect_paragraphs(self, doc):
        '''
        Return a list with the paragraphs of `doc.paragraphs` followed by the
        paragraphs of every cell of every column of every table in
        `doc.tables`.
        '''
        paragraphs = list(doc.paragraphs)
        for table in doc.tables:
            for col in table.columns:
                for cell in col.cells:
                    paragraphs.extend(cell.paragraphs)
        return paragraphs

    def parse(self, path, container, parse_entry, find_matches):
        '''
        Open the document at `path`, call `find_matches` on the text of each
        collected paragraph, and store `parse_entry(match, path)` for every
        match in `container` under the payload's 'id' key.
        '''
        doc = docx.Document(path)
        for p in self.collect_paragraphs(doc):
            for match in find_matches(p.text):
                payload = parse_entry(match, path)
                container[payload['id']] = payload

    def replace(self, source_path, target_path, compute_match, replacements, update_external = False):
        '''
        Open the document at `source_path`, substitute matches in every
        collected paragraph, and save the result to `target_path`.

        For each paragraph, `compute_match` is called with the paragraph
        text, a dictionary shared across all paragraphs, `replacements`,
        `source_path` and `update_external`; the shared dictionary is then
        applied to the paragraph through `replace_in_paragraph`.
        '''
        doc = docx.Document(source_path)
        paragraphs = self.collect_paragraphs(doc)
        # One dictionary is reused for all paragraphs, so entries added for
        # earlier paragraphs are applied to later ones as well.
        to_replace = {}
        for p in paragraphs:
            compute_match(p.text, to_replace, replacements, source_path, update_external)
            self.replace_in_paragraph(p, to_replace)
        doc.save(target_path)

Helper class that provides a standard way to create an ABC using inheritance.

#   Parser()
#   def paragraph_replace_text(self, paragraph, str, replace_str):
View Source
    def paragraph_replace_text(self, paragraph, str, replace_str):
        '''
        Replace every occurrence of `str` in `paragraph` with `replace_str`
        by editing the text of the individual runs in place.

        Adapted from
        https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

        Parameters:
            paragraph: object exposing `text` and `runs`, where every run has
                a writable `text` attribute (presumably a python-docx
                Paragraph - confirm against callers).
            str: substring to look for. The name shadows the builtin but is
                kept so that existing keyword callers keep working.
            replace_str: text inserted in place of each occurrence.

        Returns:
            The same `paragraph` object that was passed in.
        '''
        if not str:
            # An empty needle matches at every offset, so the loop below
            # would never terminate.
            return paragraph
        needle_len = len(str)
        search_pos = 0
        while True:
            # paragraph.text is re-read on every pass, so the offset found
            # here already accounts for earlier replacements; no extra
            # shifting is required.
            start = paragraph.text.find(str, search_pos)
            if start == -1:
                break
            end = start + needle_len
            # Resume right after the text about to be inserted, so the
            # replacement itself is never re-scanned and nothing is skipped.
            search_pos = start + len(replace_str)
            runs = iter(paragraph.runs)
            # Skip the leading runs that lie entirely before the match,
            # converting start/end into offsets relative to the current run.
            for run in runs:
                run_len = len(run.text)
                if start < run_len:
                    break
                start, end = start - run_len, end - run_len
            else:
                # The offset is not inside any run (no runs at all, or
                # paragraph.text contains text that is not held in runs);
                # stop instead of editing the wrong run.
                break
            # The match begins in this run: put the whole replacement here.
            run_text = run.text
            run_len = len(run_text)
            run.text = '%s%s%s' % (run_text[:start], replace_str, run_text[end:])
            end -= run_len  # run length before the replacement
            # Strip what is left of the match from the following runs; the
            # same iterator is reused, so this continues with the next run.
            for run in runs:
                if end <= 0:
                    break
                run_text = run.text
                run_len = len(run_text)
                run.text = run_text[end:]
                end -= run_len
        return paragraph

https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

#   def replace_in_paragraph(self, p, d):
View Source
    def replace_in_paragraph(self, p, d):
        '''
        Run `paragraph_replace_text` on paragraph `p` once for each
        key/value pair of dictionary `d`, using the key as the text to find
        and the value as its replacement, in the iteration order of `d`.
        '''
        for pair in d.items():
            self.paragraph_replace_text(p, *pair)
#   def collect_paragraphs(self, doc):
View Source
    def collect_paragraphs(self, doc):
        '''
        Build a flat list of paragraphs: first those in `doc.paragraphs`,
        then, for each table in `doc.tables`, the paragraphs of every cell,
        visited column by column.
        '''
        collected = list(doc.paragraphs)
        collected.extend(
            para
            for table in doc.tables
            for column in table.columns
            for cell in column.cells
            for para in cell.paragraphs
        )
        return collected
#   def parse(self, path, container, parse_entry, find_matches):
View Source
    def parse(self, path, container, parse_entry, find_matches):
        '''
        Open the document at `path` with `docx.Document`, call
        `find_matches` on the text of each collected paragraph, and store
        `parse_entry(match, path)` for every match in `container` under the
        payload's 'id' key.
        '''
        document = docx.Document(path)
        for paragraph in self.collect_paragraphs(document):
            for found in find_matches(paragraph.text):
                entry = parse_entry(found, path)
                container[entry['id']] = entry

Parse file accessible via path property

A general implementation of parse method should include following steps:

  1. Open file
  2. Read file data and transform its meaningful content into a string or an iterable of strings
  3. Call the find_matches function on each such string, resulting in an array of matches
  4. For each match found one should call parse_entry, resulting in an entry dictionary
  5. For each entry use entry "id" property as key and payload as value to populate the container provided
Parameters
  • path (str): Absolute path to file to be parsed
  • container (Dict[str, [Dict[str, Any]]]): Dictionary to be populated with parsed entries, contains key-value pairs with entry id property as key and entry payload dictionary as value
  • parse_entry (callable): Function that extracts entry (current implementation uses json parse) from matching string, returns entry payload dictionary
  • find_matches (callable): Function that searches the entry string for matches (that is, {{VALID_JSON_OBJECT}} patterns), returns array of matching substrings
#   def replace( self, source_path, target_path, compute_match, replacements, update_external=False ):
View Source
    def replace(self, source_path, target_path, compute_match, replacements, update_external = False):
        '''
        Open the document at `source_path`, substitute matches in every
        collected paragraph, and save the result to `target_path`.

        For each paragraph, `compute_match` is called with the paragraph
        text, a dictionary shared across all paragraphs, `replacements`,
        `source_path` and `update_external`; that dictionary is then applied
        to the paragraph through `replace_in_paragraph`.
        '''
        document = docx.Document(source_path)
        all_paragraphs = self.collect_paragraphs(document)
        # A single dictionary is reused for every paragraph, so entries
        # accumulate as the loop advances.
        pending = {}
        for paragraph in all_paragraphs:
            compute_match(paragraph.text, pending, replacements, source_path, update_external)
            self.replace_in_paragraph(paragraph, pending)
        document.save(target_path)

Replace file contents

A general implementation of replace method should include following steps:

  1. Open file accessible via source_path in read mode
  2. Read file data and transform it so its text content becomes available for editing
  3. For each string in the obtained text call compute_match; this results in a dictionary with {{VALID_JSON_OBJECT}} patterns as keys and computed substitutions as values
  4. For each (match, value) pair in the dictionary, replace match with value in the text
  5. Create file at target_path and write modified text
Parameters
  • source_path (str): Absolute path to file to be parsed
  • target_path (str): Absolute path to file to be generated
  • compute_match (callable): Function that
    1. searches the entry string for matches (that is, {{VALID_JSON_OBJECT}} patterns)
    2. finds entry id in replacements dictionary
    3. populates to_replace dictionary parameter with "{{VALID_JSON_OBJECT}}" as key and replacement string as value
    4. if update_external is True, it updates external resources with replacement string
    5. returns to_replace dictionary
  • replacements (Dict[str, str]): Dictionary containing pairs of "{{VALID_JSON_OBJECT}}" keys and their replacements as values
  • update_external (bool, optional): Boolean, indicating whether external resources should be updated