Skip to content

C++实现编译器(7): 跳转指令

参照《Writing An Interpreter/Compiler In Go》,改用C++实现。

项目源码: https://github.com/LeslieZhu/monkey-cpp

引言

本篇对应的源码位于目录: src/06/

src
 |06
 | |token
 | | |token.hpp
 | |evaluator
 | | |evaluator.hpp
 | | |builtins.hpp
 | |CMakeLists.txt
 | |test
 | | |lexer_test.hpp
 | | |parser_test.hpp
 | | |evaluator_test.hpp
 | | |vm_test.hpp
 | | |objects_test.hpp
 | | |ast_test.hpp
 | | |code_test.hpp
 | | |compiler_test.hpp
 | | |main.cpp
 | |lexer
 | | |lexer.hpp
 | |repl
 | | |repl.hpp
 | |code
 | | |code.hpp
 | |objects
 | | |objects.hpp
 | | |environment.hpp
 | |parser
 | | |parser.hpp
 | | |parser_tracing.hpp
 | |vm
 | | |vm.hpp
 | |ast
 | | |ast.hpp
 | |main
 | | |monkey.cpp
 | |compiler
 | | |compiler.hpp

之前字节码和编译器已经支持整数加法运算,现在添加对表达式和条件语句的支持,会涉及到编译顺序的调整,执行条件语句的跳转指令等。

出栈操作码:OpPop

栈顶的值使用后不再继续留在调用栈中,添加出栈操作码:

std::shared_ptr<objects::Error> Compile([[maybe_unused]] std::shared_ptr<ast::Node> node)
{
    //...
    else if(node->GetNodeType() == ast::NodeType::ExpressionStatement)
    {
      std::shared_ptr<ast::ExpressionStatement> exprStmt = std::dynamic_pointer_cast<ast::ExpressionStatement>(node);

      auto resultObj = Compile(exprStmt->pExpression);
      if (evaluator::isError(resultObj))
      {
          return resultObj;
      }
      emit(bytecode::OpcodeType::OpPop, {});
    }
    //...
}

在每个表达式语句后执行自动出栈操作,防止执行结果继续停留在栈顶。

中缀表达式

8种中缀表达式中和算术运算相关的是: +,-,*,/,在加法基础上补充即可。

编译器部分:

else if(node->GetNodeType() == ast::NodeType::InfixExpression)
{
    // ...

    if (infixObj->Operator == "+")
    {
        emit(bytecode::OpcodeType::OpAdd, {});
    } 
    else if (infixObj->Operator == "-")
    {
        emit(bytecode::OpcodeType::OpSub, {});
    }
    else if (infixObj->Operator == "*")
    {
        emit(bytecode::OpcodeType::OpMul, {});
    }
    else if (infixObj->Operator == "/")
    {
        emit(bytecode::OpcodeType::OpDiv, {});
    }
    else {
        return evaluator::newError("unknow operator: " + infixObj->Operator);
    }
}

虚拟机部分:

std::shared_ptr<objects::Object> executeBInaryIntegerOperaction(bytecode::OpcodeType op,
                                                                std::shared_ptr<objects::Object> left,
                                                                std::shared_ptr<objects::Object> right)
{
    auto rightObj = std::dynamic_pointer_cast<objects::Integer>(right);
    auto leftObj = std::dynamic_pointer_cast<objects::Integer>(left);

    int64_t result = 0;

    switch (op)
    {
    case bytecode::OpcodeType::OpAdd:
        result = leftObj->Value + rightObj->Value;
        break;
    case bytecode::OpcodeType::OpSub:
        result = leftObj->Value - rightObj->Value;
        break;
    case bytecode::OpcodeType::OpMul:
        result = leftObj->Value * rightObj->Value;
        break;
    case bytecode::OpcodeType::OpDiv:
        result = leftObj->Value / rightObj->Value;
        break;

    default:
        return evaluator::newError("unknow integer operator: " + bytecode::OpcodeTypeStr(op));
        break;
    }

    return Push(std::make_shared<objects::Integer>(result));
}

布尔类型

编译器部分:

else if(node->GetNodeType() == ast::NodeType::Boolean)
{
    auto boolAst = std::dynamic_pointer_cast<ast::Boolean>(node);
    if(boolAst->Value)
    {
        emit(bytecode::OpcodeType::OpTrue,{});
    } else {
        emit(bytecode::OpcodeType::OpFalse,{});
    }
}

虚拟机部分:

case bytecode::OpcodeType::OpTrue:
    {
        auto result = Push(objects::TRUE_OBJ);
        if(evaluator::isError(result))
        {
            return result;
        }
    }
    break;
case bytecode::OpcodeType::OpFalse:
    {
        auto result = Push(objects::FALSE_OBJ);
        if(evaluator::isError(result))
        {
            return result;
        }
    }
    break;

比较运算符

  • ==
  • !=
  • >
  • <
else if(node->GetNodeType() == ast::NodeType::InfixExpression)
{
    std::shared_ptr<ast::InfixExpression> infixObj = std::dynamic_pointer_cast<ast::InfixExpression>(node);

    if (infixObj->Operator == "<")
    {
        auto resultObj = Compile(infixObj->pRight);
        if (evaluator::isError(resultObj))
        {
            return resultObj;
        }

        resultObj = Compile(infixObj->pLeft);
        if (evaluator::isError(resultObj))
        {
            return resultObj;
        }


        emit(bytecode::OpcodeType::OpGreaterThan, {});

        return nullptr;
    }

    // ...

    else if (infixObj->Operator == ">")
    {
        emit(bytecode::OpcodeType::OpGreaterThan, {});
    }
    else if (infixObj->Operator == "==")
    {
        emit(bytecode::OpcodeType::OpEqual, {});
    }
    else if (infixObj->Operator == "!=")
    {
        emit(bytecode::OpcodeType::OpNotEqual, {});
    }
    else {
        return evaluator::newError("unknow operator: " + infixObj->Operator);
    }
}

4个运算符只需要定义3个操作码,通过调整编译顺序,可以将大于与小于都使用一个操作码完成。

前缀运算

  • !
  • -

编译器:

else if(node->GetNodeType() == ast::NodeType::PrefixExpression)
{
    auto prefixObj = std::dynamic_pointer_cast<ast::PrefixExpression>(node);
    auto resultObj = Compile(prefixObj->pRight);
    if (evaluator::isError(resultObj))
    {
        return resultObj;
    }

    if(prefixObj->Operator == "!")
    {
        emit(bytecode::OpcodeType::OpBang, {});
    }
    else if(prefixObj->Operator == "-")
    {
        emit(bytecode::OpcodeType::OpMinus, {});
    }
    else{
        return evaluator::newError("unknow operator: " + prefixObj->Operator);
    }
}

虚拟机:

std::shared_ptr<objects::Object> executeBangOperator()
{
    auto operand = Pop();

    if(operand == objects::TRUE_OBJ)
    {
        return Push(objects::FALSE_OBJ);
    }
    else if(operand == objects::FALSE_OBJ)
    {
        return Push(objects::TRUE_OBJ);
    }
    else
    {
        return Push(objects::FALSE_OBJ);
    }
}

std::shared_ptr<objects::Object> executeMinusOperator()
{
    auto operand = Pop();

    if(operand->Type() != objects::ObjectType::INTEGER)
    {
        return evaluator::newError("unsupported type for negation: " + operand->TypeStr());
    }
    auto integerObj = std::dynamic_pointer_cast<objects::Integer>(operand);
    return Push(std::make_shared<objects::Integer>(-1 * integerObj->Value));
}

跳转指令与偏移量

当AST已经在字节码中铺平后,字节码就是一系列的指令,它们只有先后关系,没有任何节点或递归;当需要根据条件语句来执行不同指令的时候,需要通过偏移量跳转指令来实现;而这里的偏移量可能是绝对偏移,也可能是相对偏移,如果编译的时候单次扫描的话,可以通过回填的方式,在知道具体的真实偏移量后再填入预设位置。

if条件语句为例:

if(true){
  10;
} else {
  20;
}

30;

当编译为字节码时,先编码条件语句部分,然后要有字节操作码来决定是进入哪个分支,新增两个操作码:

OpJumpNotTruthy,
OpJump,

前一个是有条件的跳转指令,如果它已经是true了则没有必要再去判断后面的else部分字节码了,使用OpJump这个无条件跳转指令。

每次语句执行后位于栈顶的元素会出栈,那么这里条件语句执行完后在调用栈就没有它,但它必须交给OpJumpNotTruthy有条件指令来作为条件进行判断,因此需要记住前一个出栈元素是什么:

struct EmittedInstruction{
    bytecode::OpcodeType Opcode;
    int Position;

    EmittedInstruction(){}
    EmittedInstruction(bytecode::OpcodeType op, int pos): Opcode(op), Position(pos){}
};

struct Compiler
{
    EmittedInstruction lastInstruction;
    EmittedInstruction prevInstruction;

    //...
};

添加一些辅助函数来记录这两个信息:

int emit(bytecode::OpcodeType op, std::vector<int> operands)
{
    auto ins = bytecode::Make(op, operands);
    auto pos = addInstruction(ins);

    setLastInstruction(op, pos);        
    return pos;
}

void setLastInstruction(bytecode::OpcodeType op, int pos)
{
    prevInstruction = lastInstruction;
    lastInstruction = EmittedInstruction(op, pos);
}

bool lastInstructionIsPop()
{
    return (lastInstruction.Opcode == bytecode::OpcodeType::OpPop);
}

void removeLastPop()
{
    instructions.assign(instructions.begin(), instructions.begin() + lastInstruction.Position);
    lastInstruction = prevInstruction;
}

每个刚出栈的元素都可以记录和判断,则可以编译期去除一些出栈操作,起到回退的效果。

至于跳转指令的真实偏移量,在编译到可以确定这个值时回填过去:

void replaceInstruction(int pos, const bytecode::Instructions& newInstruction)
{
    for (int i = 0, size = newInstruction.size(); i < size; i++)
    {
        instructions[pos + i] = newInstruction[i];
    }
}

void changeOperand(int opPos, int operand)
{
    bytecode::OpcodeType op = static_cast<bytecode::OpcodeType>(instructions[opPos]);
    auto newInstruction = bytecode::Make(op, {operand});

    replaceInstruction(opPos, newInstruction);
}

当知道具体的偏移量后,构建新的字节码,然后直接替换预设偏移量的位置,反正字节长度是一样的,等同直接覆盖也不需要担心引起乱码。

if-else表达式

else if(node->GetNodeType() == ast::NodeType::IfExpression)
 {
     std::shared_ptr<ast::IfExpression> ifObj = std::dynamic_pointer_cast<ast::IfExpression>(node);

     auto resultObj = Compile(ifObj->pCondition);
     if (evaluator::isError(resultObj))
     {
         return resultObj;
     }

     // 预设一个偏移量方便后续回填
     auto jumpNotTruthyPos =  emit(bytecode::OpcodeType::OpJumpNotTruthy, {9999});

     resultObj = Compile(ifObj->pConsequence); // 会引入一个OpPop
     if (evaluator::isError(resultObj))
     {
         return resultObj;
     }

     // 清除这个多余的OpPop
     if(lastInstructionIsPop())
     {
         removeLastPop();
     }

     // 如果没有else部分,则可以回填真实偏移量,跳转到此次即条件语句结束
     if(ifObj->pAlternative == nullptr)
     {
         auto afterConsequencePos = instructions.size();
         changeOperand(jumpNotTruthyPos, afterConsequencePos);
     }
     else
     {
         // 无条件跳转指令
         // 前面条件为真时需要跳过else部分所有字节码
         auto jumpPos = emit(bytecode::OpcodeType::OpJump, {9999});

         auto afterConsequencePos = instructions.size();
         changeOperand(jumpNotTruthyPos, afterConsequencePos);

         resultObj = Compile(ifObj->pAlternative);
         if (evaluator::isError(resultObj))
         {
             return resultObj;
         }

         if (lastInstructionIsPop())
         {
             removeLastPop();
         }

         afterConsequencePos = instructions.size();
         changeOperand(jumpPos, afterConsequencePos);
     }
 }

这里的OpJump无条件跳转指令是必须的,当条件为真时执行完如果不跳转,则会继续解码else部分。

执行跳转指令

case bytecode::OpcodeType::OpJump:
    {
        uint16_t pos;
        bytecode::ReadUint16(instructions, ip+1, pos);
        ip = pos - 1;
    }
    break;
case bytecode::OpcodeType::OpJumpNotTruthy:
    {
        uint16_t pos;
        bytecode::ReadUint16(instructions, ip+1, pos);
        ip += 2;

        auto condition = Pop();
        if(!evaluator::isTruthy(condition))
        {
            ip = pos - 1;
        }
    }
    break;

直接读取跳转指令偏移量,根据条件设置下一个读取字节码位置即可。

OpNull操作码

在if-else语句中,如果条件为false也没有else部分,则字节码如何安排呢?为了处理这个问题,总是认为存在一个else部分,如果不存在就用一个Null代替,以保证语法的完整性。

else if(node->GetNodeType() == ast::NodeType::IfExpression)
{
    std::shared_ptr<ast::IfExpression> ifObj = std::dynamic_pointer_cast<ast::IfExpression>(node);

    auto resultObj = Compile(ifObj->pCondition);
    if (evaluator::isError(resultObj))
    {
        return resultObj;
    }

    // 预设一个偏移量方便后续回填
    auto jumpNotTruthyPos =  emit(bytecode::OpcodeType::OpJumpNotTruthy, {9999});

    resultObj = Compile(ifObj->pConsequence);
    if (evaluator::isError(resultObj))
    {
        return resultObj;
    }

    if(lastInstructionIsPop())
    {
        removeLastPop();
    }

    auto jumpPos = emit(bytecode::OpcodeType::OpJump, {9999});

    auto afterConsequencePos = instructions.size();
    changeOperand(jumpNotTruthyPos, afterConsequencePos);

    if(ifObj->pAlternative == nullptr)
    {
        emit(bytecode::OpcodeType::OpNull, {});
    }
    else
    {
        resultObj = Compile(ifObj->pAlternative);
        if (evaluator::isError(resultObj))
        {
            return resultObj;
        }

        if (lastInstructionIsPop())
        {
            removeLastPop();
        }
    }

    afterConsequencePos = instructions.size();
    changeOperand(jumpPos, afterConsequencePos);
}

虚拟机:

case bytecode::OpcodeType::OpNull:
    auto result = Push(objects::NULL_OBJ);
    if(evaluator::isError(result))
    {
        return result;
    }
    break;

测试用例

全部通过测试用例:

$ ./test_monkey
[==========] Running 49 tests from 49 test suites.
[----------] Global test environment set-up.
[----------] 1 test from TestNextToken
[ RUN      ] TestNextToken.BasicAssertions
[       OK ] TestNextToken.BasicAssertions (0 ms)
[----------] 1 test from TestNextToken (0 ms total)

......

[----------] 1 test from testVMBooleanExpression
[ RUN      ] testVMBooleanExpression.basicTest
[       OK ] testVMBooleanExpression.basicTest (0 ms)
[----------] 1 test from testVMBooleanExpression (0 ms total)

[----------] 1 test from testVMConditionals
[ RUN      ] testVMConditionals.basicTest
[       OK ] testVMConditionals.basicTest (0 ms)
[----------] 1 test from testVMConditionals (0 ms total)

[----------] Global test environment tear-down
[==========] 49 tests from 49 test suites ran. (16 ms total)
[  PASSED  ] 49 tests.

总结

通过跳转指令实现跳转,对于具体跳转的偏移量通过预设位置和回填真实值的方式进行,执行跳转指令时只是改变下一个解码位置。