Computing System - コンパイラ#1:構文解析(トークナイザー、字句解析、字句要素)

開発環境

OS X El Capitan - Apple (OS)
Emacs(Text Editor)
Java (実行環境)

コンピュータシステムの理論と実装 (Noam Nisan (著)、Shimon Schocken (著)、斎藤康毅(翻訳)、オライリージャパン)の10章(コンパイラ#1:構文解析)、10.5(プロジェクト)、10.5.2(第1段階: トークナイザー)に取り組んでみる。

10.5(プロジェクト)、10.5.2(第1段階: トークナイザー)

コード(Emacs)

JackAnalyzer.py


  #!/usr/bin/env python3
# -*- coding: utf-8 -*-


import os
import glob
import sys
import re

# Maps the tokenizer's token-type names to the XML element names that are
# written to the output file.
types = dict(
    KEYWORD='keyword',
    SYMBOL='symbol',
    IDENTIFIER='identifier',
    INT_CONST='integerConstant',
    STRING_CONST='stringConstant',
)


class JackTokenizer:
    """Character-by-character tokenizer for a Jack source stream.

    The tokenizer keeps one token of lookahead: ``next_token`` /
    ``next_token_type`` hold the token that the next ``advance()`` call will
    make current.  Token types are the strings ``'KEYWORD'``, ``'SYMBOL'``,
    ``'IDENTIFIER'``, ``'INT_CONST'`` and ``'STRING_CONST'``; the type is the
    empty string once the input is exhausted.

    ``//`` line comments and ``/* ... */`` block comments are skipped.
    """

    # Compiled once instead of on every character read.
    _WHITESPACE = re.compile(r'\s')
    _DIGIT = re.compile(r'\d')
    _IDENT_CHAR = re.compile(r'[a-zA-Z0-9_]')
    _SYMBOL = re.compile(r'[-{}()\[\].,;+*/&|<>=~]')
    _KEYWORDS = frozenset((
        'class', 'constructor', 'function', 'method', 'field',
        'static', 'var', 'int', 'char', 'boolean', 'void',
        'true', 'false', 'null', 'this', 'let', 'do', 'if',
        'else', 'while', 'return',
    ))

    def __init__(self, file):
        """Wrap *file* (an object with ``read(n)`` and ``readline()``) and
        pre-read the first token into the lookahead slot."""
        self.file = file
        # One character of pushback: a char that was read while scanning the
        # previous token but belongs to the following one.
        self.next_ch = ''
        self.cur_token_type = ''
        self.cur_token = ''
        self.next_token_type = ''
        self.next_token = self.get_next_token()

    def _read_char(self):
        """Return the pushed-back character if there is one, else read one
        character from the file ('' at end of input)."""
        if self.next_ch != '':
            c = self.next_ch
            self.next_ch = ''
            return c
        return self.file.read(1)

    def _skip_block_comment(self):
        """Consume input up to and including the closing ``*/``.

        Raises:
            ValueError: if the input ends before the comment is closed.
        """
        prev = ''
        while True:
            c = self.file.read(1)
            if c == '':
                # Without this check the loop would spin forever at EOF.
                raise ValueError('unterminated block comment')
            # Tracking the previous char (rather than peeking one ahead)
            # also recognises endings such as '**/'.
            if prev == '*' and c == '/':
                return
            prev = c

    def _next_significant_char(self):
        """Skip whitespace and comments; return the first character of the
        next token, or '' at end of input.

        When the returned character is a '/' that does not start a comment,
        the character following it is stored in ``next_ch``.
        """
        while True:
            c = self._read_char()
            while self._WHITESPACE.match(c):
                c = self.file.read(1)
            if c != '/':
                return c
            follower = self.file.read(1)
            if follower == '/':
                self.file.readline()  # discard the rest of the line
            elif follower == '*':
                self._skip_block_comment()
            else:
                self.next_ch = follower
                return c

    def get_next_token(self):
        """Read the next token from the file and return its text.

        Sets ``next_token_type`` to the type of the returned token, or to ''
        when the end of input is reached (in which case '' is returned).
        The text of a string constant is returned without its quotes.

        Raises:
            ValueError: on an unterminated block comment or string constant.
        """
        c = self._next_significant_char()
        if c == '':
            self.next_token_type = ''
            return ''
        if self._SYMBOL.match(c):
            self.next_token_type = 'SYMBOL'
            return c
        if self._DIGIT.match(c):
            digits = [c]
            while True:
                c = self.file.read(1)
                if not self._DIGIT.match(c):
                    break
                digits.append(c)
            self.next_ch = c
            self.next_token_type = 'INT_CONST'
            return ''.join(digits)
        if c == '"':
            chars = []
            while True:
                c = self.file.read(1)
                if c == '':
                    raise ValueError('unterminated string constant')
                if c == '"':
                    break
                chars.append(c)
            self.next_token_type = 'STRING_CONST'
            return ''.join(chars)
        # Anything else starts a keyword or an identifier.
        chars = [c]
        while True:
            c = self.file.read(1)
            if not self._IDENT_CHAR.match(c):
                break
            chars.append(c)
        self.next_ch = c
        token = ''.join(chars)
        if token in self._KEYWORDS:
            self.next_token_type = 'KEYWORD'
        else:
            self.next_token_type = 'IDENTIFIER'
        return token

    def has_more_tokens(self):
        """Return True while a lookahead token is available.

        The check uses the token type rather than the token text so that an
        empty string constant ("") is not mistaken for end of input.
        """
        return self.next_token_type != ''

    def advance(self):
        """Make the lookahead token current and read a new lookahead."""
        self.cur_token = self.next_token
        self.cur_token_type = self.next_token_type
        self.next_token = self.get_next_token()

    def token_type(self):
        """Return the type string of the current token."""
        return self.cur_token_type

    def keyword(self):
        """Return the current token upper-cased."""
        return self.cur_token.upper()

    def symbol(self):
        """Return the current token with ``&``, ``<`` and ``>`` replaced by
        their XML entities, so it can be written into XML output."""
        # '&' must be replaced first so the entities added below are not
        # escaped a second time.
        return self.cur_token. \
            replace('&', '&amp;'). \
            replace('<', '&lt;'). \
            replace('>', '&gt;')

    def identifier(self):
        """Return the current token text."""
        return self.cur_token

    def int_val(self):
        """Return the current token converted to ``int``."""
        return int(self.cur_token)

    def string_val(self):
        """Return the current token text (string constants carry no
        quotes)."""
        return self.cur_token


# Script entry point: tokenize one .jack file, or every .jack file directly
# inside a directory, writing a <tokens> XML file next to each source file.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: {0} <file.jack | directory>'.format(sys.argv[0]))
    source = sys.argv[1]
    filenames = []
    if os.path.isfile(source):
        filenames.append(source)
    elif os.path.isdir(source):
        filenames = glob.glob(os.path.join(source, '*.jack'))
    for filename in filenames:
        # Swap only the extension; str.replace would also rewrite '.jack'
        # appearing elsewhere in the path, and would leave a name without
        # '.jack' unchanged so that the input itself got truncated.
        out_name = os.path.splitext(filename)[0] + '.xml'
        if out_name == filename:
            sys.exit('refusing to overwrite input file: {0}'.format(filename))
        with open(filename) as inf, open(out_name, 'w') as outf:
            tokenizer = JackTokenizer(inf)
            print('<tokens>', file=outf)
            while tokenizer.has_more_tokens():
                tokenizer.advance()
                t = tokenizer.token_type()
                token = ''
                if t == 'KEYWORD':
                    # keyword() upper-cases; the XML wants the source form.
                    token = tokenizer.keyword().lower()
                elif t == 'SYMBOL':
                    token = tokenizer.symbol()
                elif t == 'IDENTIFIER':
                    token = tokenizer.identifier()
                elif t == 'INT_CONST':
                    token = tokenizer.int_val()
                elif t == 'STRING_CONST':
                    token = tokenizer.string_val()
                print('<{0}> {1} </{0}>'.format(types[t], token),
                      file=outf)
            print('</tokens>', file=outf)

入出力結果(Terminal, IPython)

$ make
rm -f Square/*.xml
./JackAnalyzer.py Square
./JackAnalyzer.py ArrayTest
cat Square/Main.xml
<tokens>
<keyword> class </keyword>
<identifier> Main </identifier>
<symbol> { </symbol>
<keyword> function </keyword>
<keyword> void </keyword>
<identifier> main </identifier>
<symbol> ( </symbol>
<symbol> ) </symbol>
<symbol> { </symbol>
<keyword> var </keyword>
<identifier> SquareGame </identifier>
<identifier> game </identifier>
<symbol> ; </symbol>
<keyword> let </keyword>
<identifier> game </identifier>
<symbol> = </symbol>
<identifier> SquareGame </identifier>
<symbol> . </symbol>
<identifier> new </identifier>
<symbol> ( </symbol>
<symbol> ) </symbol>
<symbol> ; </symbol>
<keyword> do </keyword>
<identifier> game </identifier>
<symbol> . </symbol>
<identifier> run </identifier>
<symbol> ( </symbol>
<symbol> ) </symbol>
<symbol> ; </symbol>
<keyword> do </keyword>
<identifier> game </identifier>
<symbol> . </symbol>
<identifier> dispose </identifier>
<symbol> ( </symbol>
<symbol> ) </symbol>
<symbol> ; </symbol>
<keyword> return </keyword>
<symbol> ; </symbol>
<symbol> } </symbol>
<symbol> } </symbol>
</tokens>
./TextComparer Square/Main.xml test_xml/Square/MainT.xml
Comparison ended successfully
./TextComparer Square/Square.xml test_xml/Square/SquareT.xml
Comparison ended successfully
./TextComparer Square/SquareGame.xml test_xml/Square/SquareGameT.xml
Comparison ended successfully
rm -f ArrayTest/*.xml
./JackAnalyzer.py ArrayTest
./TextComparer ArrayTest/Main.xml test_xml/ArrayTest/MainT.xml
Comparison ended successfully
$

Kamimura's blog

2016年2月17日水曜日

Computing System - コンパイラ#1:構文解析(トークナイザー、字句解析、字句要素)

0 コメント:

コメントを投稿