tempgen.parsers.docx
View Source
import docx

from tempgen.parsers.parser import AbstractParser


class Parser(AbstractParser):
    '''
    Parser implementation for .docx files, built on python-docx.

    Entries are searched for (and substituted) paragraph by paragraph, both
    in the document body and inside table cells.
    '''

    def paragraph_replace_text(self, paragraph, str, replace_str):
        '''
        Replace every occurrence of `str` in `paragraph` with `replace_str`.

        Adapted from
        https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

        The replacement is written into the run where the match starts; the
        remainder of the match is trimmed from the runs that follow, so the
        runs themselves (and whatever formatting they carry) stay in place.

        Parameters:
        - paragraph: object exposing `text` and `runs` (each run exposing a
          writable `text`)
        - str (str): substring to search for; an empty value is a no-op
        - replace_str (str): text to insert instead of each occurrence

        Returns the same paragraph object.
        '''
        if not str:
            # An empty needle matches at every offset and would never terminate.
            return paragraph
        needle_len = len(str)
        search_pos = 0
        while True:
            # Offsets are taken from the paragraph's *current* text, so they
            # already account for earlier replacements (no padding needed).
            start = paragraph.text.find(str, search_pos)
            if start == -1:
                break
            end = start + needle_len
            # Resume right after the inserted text so a replacement that
            # contains the needle is never rescanned.
            search_pos = start + len(replace_str)
            runs = iter(paragraph.runs)
            for run in runs:
                run_len = len(run.text)
                if start < run_len:
                    break
                start, end = start - run_len, end - run_len
            else:
                # The match does not start inside any run, so there is
                # nothing that can be edited safely.
                break
            run_text = run.text
            run.text = '%s%s%s' % (run_text[:start], replace_str, run_text[end:])
            end -= len(run_text)
            # Strip the tail of the match from the following runs.
            for run in runs:
                if end <= 0:
                    break
                run_text = run.text
                run.text = run_text[end:]
                end -= len(run_text)
        return paragraph

    def replace_in_paragraph(self, p, d):
        '''
        Apply every substitution of `d` to paragraph `p`.

        Parameters:
        - p: paragraph to modify in place
        - d (dict): maps each text to be replaced to its replacement
        '''
        for needle, substitute in d.items():
            self.paragraph_replace_text(p, needle, substitute)

    def collect_paragraphs(self, doc):
        '''
        Return a flat list of the paragraphs of `doc`.

        The list holds `doc.paragraphs` first, followed by the paragraphs of
        every cell of every table in `doc.tables`, visited column by column.
        '''
        collected = list(doc.paragraphs)
        for table in doc.tables:
            collected.extend(
                paragraph
                for column in table.columns
                for cell in column.cells
                for paragraph in cell.paragraphs
            )
        return collected

    def parse(self, path, container, parse_entry, find_matches):
        '''
        Parse the file at `path` and populate `container` with its entries.

        Parameters:
        - path (str): path of the document to open
        - container (dict): filled with entry "id" -> entry payload
        - parse_entry (callable): called as parse_entry(match, path), returns
          the entry payload dictionary (must contain an "id" key)
        - find_matches (callable): called with a paragraph's text, returns an
          iterable of matching substrings
        '''
        document = docx.Document(path)
        for paragraph in self.collect_paragraphs(document):
            for found in find_matches(paragraph.text):
                entry = parse_entry(found, path)
                container[entry['id']] = entry

    def replace(self, source_path, target_path, compute_match, replacements, update_external=False):
        '''
        Substitute entries in the document at `source_path` and save the
        result to `target_path`.

        Parameters:
        - source_path (str): path of the document to read
        - target_path (str): path the modified document is saved to
        - compute_match (callable): called as compute_match(text, to_replace,
          replacements, source_path, update_external) for each paragraph; it
          is expected to populate `to_replace` with pattern -> replacement
        - replacements (dict): passed through to compute_match
        - update_external (bool, optional): passed through to compute_match
        '''
        document = docx.Document(source_path)
        # One dictionary is shared by all paragraphs: substitutions computed
        # for earlier paragraphs are applied to later ones as well.
        to_replace = {}
        for paragraph in self.collect_paragraphs(document):
            compute_match(paragraph.text, to_replace, replacements, source_path, update_external)
            self.replace_in_paragraph(paragraph, to_replace)
        document.save(target_path)
View Source
class Parser(AbstractParser):
    '''
    Parser implementation for .docx files, built on python-docx.

    Entries are searched for (and substituted) paragraph by paragraph, both
    in the document body and inside table cells.
    '''

    def paragraph_replace_text(self, paragraph, str, replace_str):
        '''
        Replace every occurrence of `str` in `paragraph` with `replace_str`.

        Adapted from
        https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

        The replacement is written into the run where the match starts; the
        remainder of the match is trimmed from the runs that follow, so the
        runs themselves (and whatever formatting they carry) stay in place.

        Parameters:
        - paragraph: object exposing `text` and `runs` (each run exposing a
          writable `text`)
        - str (str): substring to search for; an empty value is a no-op
        - replace_str (str): text to insert instead of each occurrence

        Returns the same paragraph object.
        '''
        if not str:
            # An empty needle matches at every offset and would never terminate.
            return paragraph
        needle_len = len(str)
        search_pos = 0
        while True:
            # Offsets are taken from the paragraph's *current* text, so they
            # already account for earlier replacements (no padding needed).
            start = paragraph.text.find(str, search_pos)
            if start == -1:
                break
            end = start + needle_len
            # Resume right after the inserted text so a replacement that
            # contains the needle is never rescanned.
            search_pos = start + len(replace_str)
            runs = iter(paragraph.runs)
            for run in runs:
                run_len = len(run.text)
                if start < run_len:
                    break
                start, end = start - run_len, end - run_len
            else:
                # The match does not start inside any run, so there is
                # nothing that can be edited safely.
                break
            run_text = run.text
            run.text = '%s%s%s' % (run_text[:start], replace_str, run_text[end:])
            end -= len(run_text)
            # Strip the tail of the match from the following runs.
            for run in runs:
                if end <= 0:
                    break
                run_text = run.text
                run.text = run_text[end:]
                end -= len(run_text)
        return paragraph

    def replace_in_paragraph(self, p, d):
        '''
        Apply every substitution of `d` to paragraph `p`.

        Parameters:
        - p: paragraph to modify in place
        - d (dict): maps each text to be replaced to its replacement
        '''
        for needle, substitute in d.items():
            self.paragraph_replace_text(p, needle, substitute)

    def collect_paragraphs(self, doc):
        '''
        Return a flat list of the paragraphs of `doc`.

        The list holds `doc.paragraphs` first, followed by the paragraphs of
        every cell of every table in `doc.tables`, visited column by column.
        '''
        collected = list(doc.paragraphs)
        for table in doc.tables:
            collected.extend(
                paragraph
                for column in table.columns
                for cell in column.cells
                for paragraph in cell.paragraphs
            )
        return collected

    def parse(self, path, container, parse_entry, find_matches):
        '''
        Parse the file at `path` and populate `container` with its entries.

        Parameters:
        - path (str): path of the document to open
        - container (dict): filled with entry "id" -> entry payload
        - parse_entry (callable): called as parse_entry(match, path), returns
          the entry payload dictionary (must contain an "id" key)
        - find_matches (callable): called with a paragraph's text, returns an
          iterable of matching substrings
        '''
        document = docx.Document(path)
        for paragraph in self.collect_paragraphs(document):
            for found in find_matches(paragraph.text):
                entry = parse_entry(found, path)
                container[entry['id']] = entry

    def replace(self, source_path, target_path, compute_match, replacements, update_external=False):
        '''
        Substitute entries in the document at `source_path` and save the
        result to `target_path`.

        Parameters:
        - source_path (str): path of the document to read
        - target_path (str): path the modified document is saved to
        - compute_match (callable): called as compute_match(text, to_replace,
          replacements, source_path, update_external) for each paragraph; it
          is expected to populate `to_replace` with pattern -> replacement
        - replacements (dict): passed through to compute_match
        - update_external (bool, optional): passed through to compute_match
        '''
        document = docx.Document(source_path)
        # One dictionary is shared by all paragraphs: substitutions computed
        # for earlier paragraphs are applied to later ones as well.
        to_replace = {}
        for paragraph in self.collect_paragraphs(document):
            compute_match(paragraph.text, to_replace, replacements, source_path, update_external)
            self.replace_in_paragraph(paragraph, to_replace)
        document.save(target_path)
Helper class that provides a standard way to create an ABC using inheritance.
View Source
def paragraph_replace_text(self, paragraph, str, replace_str):
    '''
    Replace every occurrence of `str` in `paragraph` with `replace_str`.

    Adapted from
    https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471

    The replacement is written into the run where the match starts; the
    remainder of the match is trimmed from the runs that follow, so the
    runs themselves (and whatever formatting they carry) stay in place.

    Parameters:
    - paragraph: object exposing `text` and `runs` (each run exposing a
      writable `text`)
    - str (str): substring to search for; an empty value is a no-op
    - replace_str (str): text to insert instead of each occurrence

    Returns the same paragraph object.
    '''
    if not str:
        # An empty needle matches at every offset and would never terminate.
        return paragraph
    needle_len = len(str)
    search_pos = 0
    while True:
        # Offsets are taken from the paragraph's *current* text, so they
        # already account for earlier replacements (no padding needed).
        start = paragraph.text.find(str, search_pos)
        if start == -1:
            break
        end = start + needle_len
        # Resume right after the inserted text so a replacement that
        # contains the needle is never rescanned.
        search_pos = start + len(replace_str)
        runs = iter(paragraph.runs)
        for run in runs:
            run_len = len(run.text)
            if start < run_len:
                break
            start, end = start - run_len, end - run_len
        else:
            # The match does not start inside any run, so there is
            # nothing that can be edited safely.
            break
        run_text = run.text
        run.text = '%s%s%s' % (run_text[:start], replace_str, run_text[end:])
        end -= len(run_text)
        # Strip the tail of the match from the following runs.
        for run in runs:
            if end <= 0:
                break
            run_text = run.text
            run.text = run_text[end:]
            end -= len(run_text)
    return paragraph
https://github.com/python-openxml/python-docx/issues/30#issuecomment-881106471
View Source
def replace_in_paragraph(self, p, d):
    '''
    Apply every substitution of `d` to paragraph `p`.

    Parameters:
    - p: paragraph to modify in place
    - d (dict): maps each text to be replaced to its replacement
    '''
    for needle, substitute in d.items():
        self.paragraph_replace_text(p, needle, substitute)
View Source
def collect_paragraphs(self, doc):
    '''
    Return a flat list of the paragraphs of `doc`.

    The list holds `doc.paragraphs` first, followed by the paragraphs of
    every cell of every table in `doc.tables`, visited column by column.
    '''
    collected = list(doc.paragraphs)
    for table in doc.tables:
        collected.extend(
            paragraph
            for column in table.columns
            for cell in column.cells
            for paragraph in cell.paragraphs
        )
    return collected
View Source
def parse(self, path, container, parse_entry, find_matches):
    '''
    Parse the file at `path` and populate `container` with its entries.

    Parameters:
    - path (str): path of the document to open
    - container (dict): filled with entry "id" -> entry payload
    - parse_entry (callable): called as parse_entry(match, path), returns
      the entry payload dictionary (must contain an "id" key)
    - find_matches (callable): called with a paragraph's text, returns an
      iterable of matching substrings
    '''
    document = docx.Document(path)
    for paragraph in self.collect_paragraphs(document):
        for found in find_matches(paragraph.text):
            entry = parse_entry(found, path)
            container[entry['id']] = entry
Parse file accessible via path property
A general implementation of parse method should include following steps:
- Open file
- Read file data and transform its meaningful content into a string or an iterable of strings
- Call the find_matches function on each such string, resulting in an array of matches
- For each match found one should call parse_entry, resulting in an entry dictionary
- For each entry use entry "id" property as key and payload as value to populate the container provided
Parameters
- path (str): Absolute path to file to be parsed
- container (Dict[str, [Dict[str, Any]]]): Dictionary to be populated with parsed entries, contains key-value pairs with entry id property as key and entry payload dictionary as value
- parse_entry (callable): Function that extracts entry (current implementation uses json parse) from matching string, returns entry payload dictionary
- find_matches (callable): Function that searches the entry string for matches (that is, {{VALID_JSON_OBJECT}} patterns), returns array of matching substrings
#  
def
replace(
self,
source_path,
target_path,
compute_match,
replacements,
update_external=False
):
View Source
def replace(self, source_path, target_path, compute_match, replacements, update_external=False):
    '''
    Substitute entries in the document at `source_path` and save the
    result to `target_path`.

    Parameters:
    - source_path (str): path of the document to read
    - target_path (str): path the modified document is saved to
    - compute_match (callable): called as compute_match(text, to_replace,
      replacements, source_path, update_external) for each paragraph; it
      is expected to populate `to_replace` with pattern -> replacement
    - replacements (dict): passed through to compute_match
    - update_external (bool, optional): passed through to compute_match
    '''
    document = docx.Document(source_path)
    # One dictionary is shared by all paragraphs: substitutions computed
    # for earlier paragraphs are applied to later ones as well.
    to_replace = {}
    for paragraph in self.collect_paragraphs(document):
        compute_match(paragraph.text, to_replace, replacements, source_path, update_external)
        self.replace_in_paragraph(paragraph, to_replace)
    document.save(target_path)
Replace file contents
A general implementation of replace method should include following steps:
- Open file accessible via source_path in read mode
- Read file data and transform it so its text content becomes available for editing
- For each string in the obtained text call compute_match; this results in a dictionary with {{VALID_JSON_OBJECT}} patterns as keys and computed substitutions as values
- For each (match, value) pair in the dictionary, replace match with value in the text
- Create file at target_path and write modified text
Parameters
- source_path (str): Absolute path to file to be parsed
- target_path (str): Absolute path to file to be generated
- compute_match (callable):
Function that
- searches the entry string for matches (that is, {{VALID_JSON_OBJECT}} patterns)
- finds entry id in replacements dictionary
- populates to_replace dictionary parameter with "{{VALID_JSON_OBJECT}}" as key and replacement string as value
- if update_external is True, it updates external resources with replacement string
- returns to_replace dictionary
- replacements (Dict[str, str]): Dictionary containing pairs of "{{VALID_JSON_OBJECT}}" keys and their replacements as values
- update_external (bool, optional): Boolean, indicating whether external resources should be updated