tempgen.parsers.docx

View Source

import docx
from tempgen.parsers.parser import AbstractParser

class Parser(AbstractParser):
    '''
    Parser for .docx files, implemented on top of the ``docx`` package.

    Text is processed paragraph by paragraph; paragraphs are taken from the
    document body and from the cells of the document's tables.
    '''

    def paragraph_replace_text(self, paragraph, str, replace_str):
        '''
        Replace every occurrence of ``str`` in ``paragraph`` with ``replace_str``.

        Adapted from
        https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

        The edit is applied to the text of the paragraph's runs, so a match
        that spans several runs is handled: the replacement is written into
        the run where the match begins and the rest of the match is trimmed
        from the runs that follow.

        Parameters:
            paragraph: Object exposing ``text`` and ``runs`` (each run having a
                writable ``text``). Offsets found in ``paragraph.text`` are
                assumed to line up with the concatenated run texts.
            str (str): Text to search for. An empty value is a no-op.
            replace_str (str): Text inserted in place of each occurrence.

        Returns:
            The same ``paragraph`` object, modified in place.
        '''
        needle = str  # local alias: the parameter name shadows the builtin
        if not needle:
            # find('') matches at every position, so the loop would never end.
            return paragraph
        needle_len = len(needle)
        search_pos = 0
        while True:
            # paragraph.text is read again on every pass, so the offsets below
            # always refer to the current (already edited) text.
            start = paragraph.text.find(needle, search_pos)
            if start == -1:
                break
            end = start + needle_len
            # Resume right after the inserted text so it is never rescanned.
            search_pos = start + len(replace_str)

            # Locate the run in which the match begins, converting the
            # paragraph offsets into offsets relative to that run.
            runs = iter(paragraph.runs)
            first_run = None
            for run in runs:
                run_len = len(run.text)
                if start < run_len:
                    first_run = run
                    break
                start, end = start - run_len, end - run_len
            if first_run is None:
                # The match lies outside the text held by the runs; there is
                # no run that could be edited for it.
                break

            run_text = first_run.text
            first_run.text = '%s%s%s' % (run_text[:start], replace_str, run_text[end:])
            end -= len(run_text)
            # Trim whatever is left of the match from the following runs.
            for run in runs:
                if end <= 0:
                    break
                run_text = run.text
                run.text = run_text[end:]
                end -= len(run_text)
        return paragraph

    def replace_in_paragraph(self, p, d):
        '''
        Apply every replacement of the mapping ``d`` to paragraph ``p``.

        Parameters:
            p: Paragraph passed on to ``paragraph_replace_text``.
            d (dict): Maps each text to be replaced to its replacement.
        '''
        for replaced, replacement in d.items():
            self.paragraph_replace_text(p, replaced, replacement)

    def collect_paragraphs(self, doc):
        '''
        Return the paragraphs of ``doc`` as a single list.

        The list holds ``doc.paragraphs`` first, followed by the paragraphs of
        every cell of every table in ``doc.tables``, visited column by column.
        '''
        paragraphs = list(doc.paragraphs)
        for table in doc.tables:
            for col in table.columns:
                for cell in col.cells:
                    paragraphs.extend(cell.paragraphs)
        return paragraphs

    def parse(self, path, container, parse_entry, find_matches):
        '''
        Parse the file at ``path`` and populate ``container`` with its entries.

        Parameters:
            path (str): Path of the document to open.
            container (dict): Filled in place; each payload returned by
                ``parse_entry`` is stored under its ``'id'`` key.
            parse_entry (callable): Called as ``parse_entry(match, path)``;
                must return a payload dictionary containing an ``'id'`` key.
            find_matches (callable): Called with the text of each paragraph;
                must return an iterable of matches.
        '''
        doc = docx.Document(path)
        for p in self.collect_paragraphs(doc):
            for match in find_matches(p.text):
                payload = parse_entry(match, path)
                container[payload['id']] = payload

    def replace(self, source_path, target_path, compute_match, replacements, update_external = False):
        '''
        Write a copy of the document at ``source_path`` with replacements applied.

        Parameters:
            source_path (str): Path of the document to read.
            target_path (str): Path the modified document is saved to.
            compute_match (callable): Called for each paragraph as
                ``compute_match(text, to_replace, replacements, source_path,
                update_external)``; expected to fill ``to_replace`` with
                text-to-replacement pairs.
            replacements (dict): Passed through to ``compute_match``.
            update_external (bool, optional): Passed through to
                ``compute_match``.
        '''
        doc = docx.Document(source_path)
        paragraphs = self.collect_paragraphs(doc)
        # One dictionary is shared by all paragraphs, so pairs collected for
        # earlier paragraphs are applied to later paragraphs as well.
        to_replace = {}
        for p in paragraphs:
            compute_match(p.text, to_replace, replacements, source_path, update_external)
            self.replace_in_paragraph(p, to_replace)
        doc.save(target_path)

# class Parser(tempgen.parsers.parser.AbstractParser):

View Source

class Parser(AbstractParser):
    '''
    Parser for .docx files, implemented on top of the ``docx`` package.

    Text is processed paragraph by paragraph; paragraphs are taken from the
    document body and from the cells of the document's tables.
    '''

    def paragraph_replace_text(self, paragraph, str, replace_str):
        '''
        Replace every occurrence of ``str`` in ``paragraph`` with ``replace_str``.

        Adapted from
        https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

        The edit is applied to the text of the paragraph's runs, so a match
        that spans several runs is handled: the replacement is written into
        the run where the match begins and the rest of the match is trimmed
        from the runs that follow.

        Parameters:
            paragraph: Object exposing ``text`` and ``runs`` (each run having a
                writable ``text``). Offsets found in ``paragraph.text`` are
                assumed to line up with the concatenated run texts.
            str (str): Text to search for. An empty value is a no-op.
            replace_str (str): Text inserted in place of each occurrence.

        Returns:
            The same ``paragraph`` object, modified in place.
        '''
        needle = str  # local alias: the parameter name shadows the builtin
        if not needle:
            # find('') matches at every position, so the loop would never end.
            return paragraph
        needle_len = len(needle)
        search_pos = 0
        while True:
            # paragraph.text is read again on every pass, so the offsets below
            # always refer to the current (already edited) text.
            start = paragraph.text.find(needle, search_pos)
            if start == -1:
                break
            end = start + needle_len
            # Resume right after the inserted text so it is never rescanned.
            search_pos = start + len(replace_str)

            # Locate the run in which the match begins, converting the
            # paragraph offsets into offsets relative to that run.
            runs = iter(paragraph.runs)
            first_run = None
            for run in runs:
                run_len = len(run.text)
                if start < run_len:
                    first_run = run
                    break
                start, end = start - run_len, end - run_len
            if first_run is None:
                # The match lies outside the text held by the runs; there is
                # no run that could be edited for it.
                break

            run_text = first_run.text
            first_run.text = '%s%s%s' % (run_text[:start], replace_str, run_text[end:])
            end -= len(run_text)
            # Trim whatever is left of the match from the following runs.
            for run in runs:
                if end <= 0:
                    break
                run_text = run.text
                run.text = run_text[end:]
                end -= len(run_text)
        return paragraph

    def replace_in_paragraph(self, p, d):
        '''
        Apply every replacement of the mapping ``d`` to paragraph ``p``.

        Parameters:
            p: Paragraph passed on to ``paragraph_replace_text``.
            d (dict): Maps each text to be replaced to its replacement.
        '''
        for replaced, replacement in d.items():
            self.paragraph_replace_text(p, replaced, replacement)

    def collect_paragraphs(self, doc):
        '''
        Return the paragraphs of ``doc`` as a single list.

        The list holds ``doc.paragraphs`` first, followed by the paragraphs of
        every cell of every table in ``doc.tables``, visited column by column.
        '''
        paragraphs = list(doc.paragraphs)
        for table in doc.tables:
            for col in table.columns:
                for cell in col.cells:
                    paragraphs.extend(cell.paragraphs)
        return paragraphs

    def parse(self, path, container, parse_entry, find_matches):
        '''
        Parse the file at ``path`` and populate ``container`` with its entries.

        Parameters:
            path (str): Path of the document to open.
            container (dict): Filled in place; each payload returned by
                ``parse_entry`` is stored under its ``'id'`` key.
            parse_entry (callable): Called as ``parse_entry(match, path)``;
                must return a payload dictionary containing an ``'id'`` key.
            find_matches (callable): Called with the text of each paragraph;
                must return an iterable of matches.
        '''
        doc = docx.Document(path)
        for p in self.collect_paragraphs(doc):
            for match in find_matches(p.text):
                payload = parse_entry(match, path)
                container[payload['id']] = payload

    def replace(self, source_path, target_path, compute_match, replacements, update_external = False):
        '''
        Write a copy of the document at ``source_path`` with replacements applied.

        Parameters:
            source_path (str): Path of the document to read.
            target_path (str): Path the modified document is saved to.
            compute_match (callable): Called for each paragraph as
                ``compute_match(text, to_replace, replacements, source_path,
                update_external)``; expected to fill ``to_replace`` with
                text-to-replacement pairs.
            replacements (dict): Passed through to ``compute_match``.
            update_external (bool, optional): Passed through to
                ``compute_match``.
        '''
        doc = docx.Document(source_path)
        paragraphs = self.collect_paragraphs(doc)
        # One dictionary is shared by all paragraphs, so pairs collected for
        # earlier paragraphs are applied to later paragraphs as well.
        to_replace = {}
        for p in paragraphs:
            compute_match(p.text, to_replace, replacements, source_path, update_external)
            self.replace_in_paragraph(p, to_replace)
        doc.save(target_path)

Helper class that provides a standard way to create an ABC using inheritance.

# Parser()

# def paragraph_replace_text(self, paragraph, str, replace_str):

View Source

    def paragraph_replace_text(self, paragraph, str, replace_str):
        '''
        Replace every occurrence of ``str`` in ``paragraph`` with ``replace_str``.

        Adapted from
        https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

        The edit is applied to the text of the paragraph's runs, so a match
        that spans several runs is handled: the replacement is written into
        the run where the match begins and the rest of the match is trimmed
        from the runs that follow.

        Parameters:
            paragraph: Object exposing ``text`` and ``runs`` (each run having a
                writable ``text``). Offsets found in ``paragraph.text`` are
                assumed to line up with the concatenated run texts.
            str (str): Text to search for. An empty value is a no-op.
            replace_str (str): Text inserted in place of each occurrence.

        Returns:
            The same ``paragraph`` object, modified in place.
        '''
        needle = str  # local alias: the parameter name shadows the builtin
        if not needle:
            # find('') matches at every position, so the loop would never end.
            return paragraph
        needle_len = len(needle)
        search_pos = 0
        while True:
            # paragraph.text is read again on every pass, so the offsets below
            # always refer to the current (already edited) text.
            start = paragraph.text.find(needle, search_pos)
            if start == -1:
                break
            end = start + needle_len
            # Resume right after the inserted text so it is never rescanned.
            search_pos = start + len(replace_str)

            # Locate the run in which the match begins, converting the
            # paragraph offsets into offsets relative to that run.
            runs = iter(paragraph.runs)
            first_run = None
            for run in runs:
                run_len = len(run.text)
                if start < run_len:
                    first_run = run
                    break
                start, end = start - run_len, end - run_len
            if first_run is None:
                # The match lies outside the text held by the runs; there is
                # no run that could be edited for it.
                break

            run_text = first_run.text
            first_run.text = '%s%s%s' % (run_text[:start], replace_str, run_text[end:])
            end -= len(run_text)
            # Trim whatever is left of the match from the following runs.
            for run in runs:
                if end <= 0:
                    break
                run_text = run.text
                run.text = run_text[end:]
                end -= len(run_text)
        return paragraph

https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

# def replace_in_paragraph(self, p, d):

View Source

    def replace_in_paragraph(self, p, d):
        '''
        Run ``paragraph_replace_text`` on ``p`` once per entry of ``d``.

        Each key of ``d`` is the text to look for and its value is the text
        substituted for it; entries are processed in the mapping's own order.
        '''
        for needle in d:
            substitute = d[needle]
            self.paragraph_replace_text(p, needle, substitute)

# def collect_paragraphs(self, doc):

View Source

    def collect_paragraphs(self, doc):
        '''
        Gather the paragraphs of ``doc`` into one flat list.

        Body paragraphs (``doc.paragraphs``) come first; they are followed by
        the paragraphs of each table cell, walking ``doc.tables`` table by
        table, column by column and cell by cell.
        '''
        collected = list(doc.paragraphs)
        collected.extend(
            paragraph
            for table in doc.tables
            for column in table.columns
            for cell in column.cells
            for paragraph in cell.paragraphs
        )
        return collected

# def parse(self, path, container, parse_entry, find_matches):

View Source

    def parse(self, path, container, parse_entry, find_matches):
        '''
        Open the document at ``path`` and record its entries in ``container``.

        ``find_matches`` is called with the text of every collected paragraph;
        each match it yields is handed to ``parse_entry(match, path)`` and the
        returned payload is stored in ``container`` under ``payload['id']``.
        '''
        document = docx.Document(path)
        for paragraph in self.collect_paragraphs(document):
            for found in find_matches(paragraph.text):
                entry = parse_entry(found, path)
                container[entry['id']] = entry

Parse file accessible via path property

A general implementation of parse method should include following steps:

Open file
Read file data and transform its meaningful content into a string or an iterable of strings
Call the find_matches function on each such string, resulting in an array of matches
For each match found, call parse_entry, resulting in an entry payload dictionary
For each entry, use the entry "id" property as key and the payload as value to populate the container provided

Parameters

path (str): Absolute path to file to be parsed
container (Dict[str, Dict[str, Any]]): Dictionary to be populated with parsed entries; contains key-value pairs with the entry id property as key and the entry payload dictionary as value
parse_entry (callable): Function that extracts entry (current implementation uses json parse) from matching string, returns entry payload dictionary
find_matches (callable): Function that searches the entry string for matches (that is, {{VALID_JSON_OBJECT}} patterns), returns array of matching substrings

# def replace( self, source_path, target_path, compute_match, replacements, update_external=False ):

View Source

    def replace(self, source_path, target_path, compute_match, replacements, update_external = False):
        '''
        Save to ``target_path`` a copy of ``source_path`` with substitutions applied.

        For every collected paragraph, ``compute_match`` is called with the
        paragraph text, the shared dictionary of pending substitutions,
        ``replacements``, ``source_path`` and ``update_external``; the pending
        substitutions are then applied to that paragraph.
        '''
        document = docx.Document(source_path)
        # A single dictionary is reused for the whole document, so whatever
        # compute_match stored for earlier paragraphs is still present later.
        pending = {}
        for paragraph in self.collect_paragraphs(document):
            compute_match(paragraph.text, pending, replacements, source_path, update_external)
            self.replace_in_paragraph(paragraph, pending)
        document.save(target_path)

Replace file contents

A general implementation of replace method should include following steps:

Open file accessible via source_path in read mode
Read file data and transform it so its text content becomes available for editing
For each string in the obtained text call compute_match; this results in a dictionary with {{VALID_JSON_OBJECT}} patterns as keys and computed substitutions as values
For each (match, value) pair in the dictionary, replace match with value in the text
Create file at target_path and write modified text

Parameters

source_path (str): Absolute path to file to be parsed
target_path (str): Absolute path to file to be generated
compute_match (callable): Function that
1. searches the entry string for matches (that is, {{VALID_JSON_OBJECT}} patterns)
2. finds entry id in replacements dictionary
3. populates to_replace dictionary parameter with "{{VALID_JSON_OBJECT}}" as key and replacement string as value
4. if update_external is True, it updates external resources with replacement string
5. returns to_replace dictionary
replacements (Dict[str, str]): Dictionary containing pairs of "{{VALID_JSON_OBJECT}}" keys and their replacements as values
update_external (bool, optional): Boolean, indicating whether external resources should be updated