Fri, 18 Sep 2020 20:22:22 +0300
added unit testing
import datetime
import re
import string
from collections import defaultdict

import linetypes


class Header:
    '''
        Result type of header processing, this contains all the header
        information.
    '''
    def __init__(self):
        self.description = None
        self.name = None
        self.author = None
        self.username = None
        self.filetype = None
        self.qualifiers = None
        self.license = None
        self.help = ''
        self.bfc = None
        self.category = None
        self.keywords = ''
        self.cmdline = None
        # list of HistoryEntry objects, in the order they were parsed
        self.history = []
        # maps a pattern description (e.g. 'name', 'history') to the list of
        # model indices where that pattern matched
        self.occurrences = defaultdict(list)

    @property
    def valid(self):
        ''' A Header is always valid; BadHeader.valid is the False twin. '''
        return True

    @property
    def effective_filetype(self):
        '''
            What's the effective file type? The "Unofficial_" prefix is
            left out.
        '''
        prefix = 'Unofficial_'
        if self.filetype.startswith(prefix):
            # Strip exactly one leading prefix. Splitting on the prefix
            # would also break the string at any later occurrence.
            return self.filetype[len(prefix):]
        return self.filetype

    @property
    def effective_category(self):
        '''
            Returns the category of the part: the explicit category if one
            is set, otherwise the first word of the description. Leading
            punctuation marks of that word are ignored.
        '''
        if self.category:
            return self.category
        first_word = self.description.split(' ', 1)[0]
        return first_word.lstrip(string.punctuation)


class BadHeader:
    '''
        If header processing fails this object is returned as the resulting
        header instead. It contains the details of where the header could
        not be understood and why.
    '''
    def __init__(self, index, reason):
        self.index = index
        self.reason = reason

    def __repr__(self):
        return 'header.BadHeader(index = {index!r}, reason = {reason!r})'.format(
            index = self.index,
            reason = self.reason,
        )

    @property
    def valid(self):
        ''' A BadHeader is never valid. '''
        return False


def is_invertnext(entry):
    ''' Is the given object a "BFC INVERTNEXT" meta command? '''
    return isinstance(entry, linetypes.MetaCommand) \
        and entry.text == "BFC INVERTNEXT"


def is_suitable_header_object(entry):
    '''
        Is the given object something that we can consider to be part of
        the header?
    '''
    if is_invertnext(entry):
        # It's BFC INVERTNEXT, that's not a header command.
        return False
    # Check if it's one of the functional linetypes
    functional_linetypes = (
        linetypes.SubfileReference,
        linetypes.LineSegment,
        linetypes.Triangle,
        linetypes.Quadrilateral,
        linetypes.ConditionalLine,
        linetypes.Comment,
        linetypes.Error,
    )
    return not isinstance(entry, functional_linetypes)


class HeaderError(Exception):
    '''
        An error raised during header parsing. `index` is the position in
        the model where parsing failed and `reason` is the message.
    '''
    def __init__(self, index, reason):
        # Pass the values on to Exception so that `args` is populated and
        # the exception can be pickled and copied.
        super().__init__(index, reason)
        self.index, self.reason = index, reason

    def __repr__(self):
        return 'HeaderError({index!r}, {reason!r})'.format(
            index = self.index,
            reason = self.reason,
        )

    def __str__(self):
        # This used to return the bare name `reason`, which is not defined
        # in this scope and raised NameError whenever the error was printed.
        return self.reason


class HistoryEntry:
    '''
        Represents a single !HISTORY entry
    '''
    def __init__(self, date, user, text):
        self.date, self.user, self.text = date, user, text

    def __repr__(self):
        return 'HistoryEntry({date!r}, {user!r}, {text!r})'.format(
            date = self.date,
            user = self.user,
            text = self.text,
        )


class HeaderParser:
    '''
        Parses the header out of a model body. The body is indexed like a
        list and its meta command entries expose their content as `.text`.
    '''
    def __init__(self):
        self.model_body = None
        self.cursor = 0
        self.problems = []
        # State that parse() and try_to_match() fill in; initialised here so
        # that the attributes always exist.
        self.result = None
        self.order = []
        self.groups = None

    def parse(self, model_body):
        '''
            Parses the header of the given model body.

            Returns a dict with the parsed Header under 'header' and the
            index just past the last consumed header line under
            'end-index'. Raises HeaderError if the header cannot be parsed.
        '''
        result = Header()
        self.result = result
        self.order = []
        # start before the first entry: skip_to_next advances the cursor
        # before it looks at anything
        self.cursor = -1
        self.model_body = model_body
        # The first three meta commands are mandatory and come in a fixed
        # order: description, name, author.
        self.skip_to_next()
        result.description = self.current()
        self.skip_to_next()
        result.name = self.parse_pattern(r'^Name: (.+)$', 'name')[0]
        self.skip_to_next()
        # Parse author line
        result.author, result.username = self.parse_pattern(
            r'^Author: (?:([^\[]+))?(?:\[([^\]]+)\])?',
            'author',
        )
        if isinstance(result.author, str):
            # clean surrounding spaces
            result.author = result.author.strip()
        if not result.author and not result.username:
            self.parse_error('author line does not contain a name nor username')
        # use more patterns to parse the rest of the header
        for _ in self.get_more_header_stuff():
            if self.try_to_match(
                r'^!LDRAW_ORG '
                r'((?:Unofficial_)?(?:'
                r'Part|'
                r'Subpart|'
                r'Primitive|'
                r'8_Primitive|'
                r'48_Primitive|'
                r'Shortcut'
                r'))\s?'
                r'(.*)$',
                'part type',
            ):
                result.filetype = self.groups[0]
                result.qualifiers = re.findall(
                    r'(?:Physical_Colour|Alias|ORIGINAL|UPDATE \d\d\d\d-\d\d)',
                    self.groups[1],
                )
            elif self.try_to_match(r'^!LICENSE (.+)$', 'license'):
                result.license = self.groups[0]
            elif self.try_to_match(
                r'BFC (CERTIFY CW|CERTIFY CCW|NOCERTIFY)',
                'bfc',
            ):
                result.bfc = self.groups[0]
            elif self.try_to_match(
                r'!HISTORY (\d{4}-\d{2}-\d{2}) ([\[{][^\]}]+[\]}]) (.+)$',
                'history',
            ):
                try:
                    time_object = datetime.datetime.strptime(
                        self.groups[0],
                        '%Y-%m-%d',
                    )
                except ValueError:
                    # the pattern only checks the digit layout; a date such
                    # as 2020-13-40 is rejected here
                    self.parse_error("invalid ISO date in history")
                else:
                    result.history.append(HistoryEntry(
                        date = time_object.date(),
                        user = self.groups[1],
                        text = self.groups[2],
                    ))
            elif self.try_to_match(r'!HELP (.+)', 'help'):
                # multiple !HELP lines are joined with newlines
                if result.help:
                    result.help += '\n'
                result.help += self.groups[0]
            elif self.try_to_match(r'!CATEGORY (.+)', 'category'):
                result.category = self.groups[0]
            elif self.try_to_match(r'!KEYWORDS (.+)', 'keywords'):
                # multiple !KEYWORDS lines are joined with newlines
                if result.keywords:
                    result.keywords += '\n'
                result.keywords += self.groups[0]
            elif self.try_to_match(r'!CMDLINE (.+)', 'cmdline'):
                result.cmdline = self.groups[0]
            else:
                # Unrecognised meta command: the header ends here. Step back
                # so that 'end-index' below points at this very line.
                self.cursor -= 1
                break
        if not result.filetype:
            self.parse_error('LDRAW_ORG line is missing')
        return {
            'header': result,
            'end-index': self.cursor + 1, # record where the header ended
        }

    def parse_error(self, message):
        ''' Raises a HeaderError for the current cursor position. '''
        raise HeaderError(index = self.cursor, reason = message)

    def get_more_header_stuff(self):
        '''
            Iterates through the header and yields metacommand entries one
            after the other. The cursor is moved onto each entry before it
            is yielded. Entries that are suitable for the header but are
            not meta commands are passed over without being yielded.
        '''
        self.cursor += 1
        new_cursor = self.cursor
        while new_cursor < len(self.model_body):
            entry = self.model_body[new_cursor]
            if not is_suitable_header_object(entry):
                # looks like the header ended
                break
            if isinstance(entry, linetypes.MetaCommand):
                self.cursor = new_cursor
                yield entry
            new_cursor += 1

    def skip_to_next(self, *, spaces_expected = 0):
        '''
            Skip to the next header line, i.e. advance the cursor to the
            next meta command. Raises HeaderError if the model ends or a
            non-header object is found first.

            `spaces_expected` is accepted but currently not used.
        '''
        while True:
            if self.cursor + 1 >= len(self.model_body):
                # wound up past the end of model
                self.parse_error('file does not have a proper header')
            self.cursor += 1
            entry = self.model_body[self.cursor]
            if not is_suitable_header_object(entry):
                self.parse_error('header is incomplete')
            if isinstance(entry, linetypes.MetaCommand):
                return

    def try_to_match(self, pattern, patterntype):
        '''
            Tries to parse the specified pattern and to store the groups in
            self.groups. Returns whether or not this succeeded. On failure
            self.groups is left untouched.
        '''
        try:
            self.groups = self.parse_pattern(pattern, patterntype)
        except HeaderError:
            # Only a failed match is expected here. Anything else
            # (KeyboardInterrupt, programming errors) must propagate rather
            # than be mistaken for "pattern did not match".
            return False
        return True

    def current(self):
        '''
            Returns the text of the header line we're currently processing.
        '''
        entry = self.model_body[self.cursor]
        assert isinstance(entry, linetypes.MetaCommand)
        return entry.text

    def parse_pattern(self, pattern, description):
        '''
            Matches the current header line against the specified pattern
            and returns the matched groups. A successful match is also
            recorded in self.order and in the result's occurrences.
            If the line does not match, raises HeaderError. See
            try_to_match for a softer wrapper that does not raise
            exceptions.
        '''
        match = re.search(pattern, self.current())
        if match:
            self.order.append(description)
            self.result.occurrences[description].append(self.cursor)
            return match.groups()
        self.parse_error("couldn't parse {}".format(description))