Search code examples
c++recursionassemblyhalting-problem

Verifying halting-problem on self-implemented pseudo-assembly


I wrote a very simple implementation of what could be a similarity to Assembly/machine code.

It is even capable of recursion as in this example:

9 6
IFEQ R0,0
RET 1
ENDIF
MOV R1,R0
SUB R1,1
CALL R1
MOV R2,R9
MUL R2,R0
RET R2

Output: 720 (factorial of 6)

Description:

9 = Program Lines
6 = Program Input. Will be set to registry R0 value at class construction
CALL = calls the program again with the passed value (recursion)
RET = returns the program with the specified value. Sets registry R9 value to output value.

R0 to R9 -> general purpose registry.
R0 - program input value
R9 - program output value

-edit: Program commands: MOV, ADD, SUB, MUL, DIV, MOD, IFEQ, IFNEQ, IFG, IFGE, IFL, IFLE, ENDIF, CALL, RET

However the program can enter into infinite loop/recursion. e.g:

2 0
CALL 10
RET 1 //will never be reached

How do I verify whether MY program will enter into an infinite loop/recursion?

Here's my implementation, don't know whether it's necessary, but just in case you need. (It's the whole code... hope you don't mind).

#include <iostream>
#include <map>
#include <string> //std::getline
#include <sstream>
#include <vector>

namespace util
{
    template<typename I>I readcin(I& input) {
        std::cin >> input;
        std::cin.clear(); std::cin.ignore();
        return input;
    }
    template<typename I, typename...O> I readcin(I& input, O&... others) {
        readcin(input);
        return readcin(others...);
    }
}

//operations
enum OP
{
    MOV, ADD, SUB, MUL, DIV, MOD,
    IFG, IFL,
    IFEQ, IFGE, IFLE,
    IFNEQ,
    CALL,
    RET,
    ENDIF,
};
std::map<std::string, OP> OPTABLE
{
    {"MOV", MOV}, {"ADD", ADD}, {"SUB", SUB}, {"MUL", MUL}, {"DIV", DIV}, {"MOD", MOD},
    {"RET", RET},
    {"IFG", IFG}, {"IFL", IFL},
    {"IFNEQ", IFNEQ}, {"IFEQ", IFEQ}, {"IFGE", IFGE}, {"IFLE", IFLE},
    {"CALL", CALL},
    {"ENDIF", ENDIF}
};
//registry index
enum RI
{
    R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, RI_MAX
};
std::map<std::string, RI> RITABLE =
{
    {"R0", R0}, {"R1", R1}, {"R2", R2}, {"R3", R3}, {"R4", R4}, {"R5", R5},
    {"R6", R6}, {"R7", R7}, {"R8", R8}, {"R9", R9}
};

struct Instruction
{
    OP op;
    RI r1;
    int r2value;
    Instruction() = delete;
    Instruction(OP operation, RI firstRegister, int _2ndRegValue = -1)
    {
        op = operation;
        r1 = firstRegister;
        r2value = _2ndRegValue;
    }
};


class Assembly
{

private:
    int REG[RI::RI_MAX] {0};
    int GetRegistryValue(RI ri) const { return REG[ri]; }
    void SetRegistryValue(RI ri, int val) { REG[ri] = val; }

    enum CMP_FLAG{ CMP_FAIL, CMP_OK };
    CMP_FLAG flag { CMP_OK };
    CMP_FLAG GetFlag() const { return flag; }
    void SetFlag(bool setFlag) { flag = static_cast<CMP_FLAG>(setFlag); }

    std::vector<std::string> programLines;

    OP ExtractOP(const std::string& line);
    RI ExtractRI(const std::string& line, OP op);
    int Extract2ndRIval(const std::string& line, OP op);
public:
    void addCommand(const std::string& line) { programLines.push_back(line); }
    void Execute();

    Assembly() = delete;
    Assembly(int inputValue) { REG[R0] = inputValue; }
    int ReturnValue() const { return REG[R9]; }
private:
    //recursion only
    Assembly(int inputValue, const std::vector<std::string>& progLines) {
        REG[R0] = inputValue;
        programLines = progLines;
        this->Execute();
    }
};

OP Assembly::ExtractOP(const std::string& line)
{
    std::istringstream issline{ line };
    std::string operation;
    issline >> operation;

    return OPTABLE[operation];
}

RI Assembly::ExtractRI(const std::string& line, OP op)
{
    auto space = line.find(' ');
    if(op <= IFNEQ){
        auto comma = line.find(',');
        return RITABLE[std::string(line.begin() + space + 1, line.begin() + comma)];
    }
    return RI_MAX;
}

int Assembly::Extract2ndRIval(const std::string& line, OP op)
{
    if(op == ENDIF) {
        return -1;
    }

    std::size_t spaceOrComma;
    if(op == CALL || op == RET) {
        spaceOrComma = line.find(' ');
    } else {
        spaceOrComma = line.find(',');
    }

    std::string opval = std::string(line.begin() + spaceOrComma + 1, line.end());
    auto it = RITABLE.find(opval);
    if(it != RITABLE.end()){
        return this->REG[it->second];
    }
    auto num = std::atoi(opval.c_str());
    return num;
}

void Assembly::Execute()
{
    for(const std::string& line : programLines)
    {
        OP op = ExtractOP(line);
        RI r1 = ExtractRI(line, op);
        int r2value = Extract2ndRIval(line, op);

        Instruction command ( op, r1, r2value );

        if(GetFlag() == CMP_FAIL)
        {
            if(command.op == ENDIF){
                SetFlag(CMP_OK);
            }
            continue;
        }

        switch(command.op)
        {
            case MOV: { SetRegistryValue(command.r1, command.r2value); } break;
            case ADD: { SetRegistryValue(command.r1, REG[command.r1] + command.r2value); } break;
            case SUB: { SetRegistryValue(command.r1, REG[command.r1] - command.r2value); } break;
            case MUL: { SetRegistryValue(command.r1, REG[command.r1] * command.r2value); } break;
            case DIV: { SetRegistryValue(command.r1, REG[command.r1] / command.r2value); } break;
            case MOD: { SetRegistryValue(command.r1, REG[command.r1] % command.r2value); } break;

            case IFEQ:  { SetFlag(GetRegistryValue(command.r1) == command.r2value); } break;
            case IFNEQ: { SetFlag(GetRegistryValue(command.r1) != command.r2value); } break;
            case IFG:   { SetFlag(GetRegistryValue(command.r1) > command.r2value); } break;
            case IFL:   { SetFlag(GetRegistryValue(command.r1) < command.r2value); } break;
            case IFGE:  { SetFlag(GetRegistryValue(command.r1) >= command.r2value); } break;
            case IFLE:  { SetFlag(GetRegistryValue(command.r1) <= command.r2value); } break;

            case RET:
            {
                SetRegistryValue(R9, command.r2value);
                return;
            }break;

            //oh boy!
            case CALL:
            {
               // std::cout << "value to call:" << command.r2value << std::endl;
                Assembly recursion(command.r2value, this->programLines);
                SetRegistryValue(R9, recursion.ReturnValue());
            }break;
        }
    }
}

int main()
{
    while(true)
    {
        int pl, input;
        util::readcin(pl, input);
        if(pl == 0){
            break;
        }

        Assembly Asm(input);
        for(auto i=0; i<pl; ++i)
        {
            std::string line;
            std::getline(std::cin, line);
            Asm.addCommand(line);
        }
        Asm.Execute();

        std::cout << Asm.ReturnValue() << std::endl;
    }
    return 0;
}

Solution

  • The only way to check to see if a program is stuck in an infinite loop in the general case is to check to see the program has entered the same state as previous state. If it has entered exactly the same state previously, then it must continue on executing in a loop returning to the same state over and over following the same sequence of steps. In real programs this essentially impossible because of the huge number of possible states the program can be in, but your assembly language only allows much more limited number of possible states.

    Since your CALL instruction works just like invoking the program at the start and this is the only form of looping, this means that checking if the code enters the same state twice is simple. A CALL instruction with a certain argument has the exact same effect as invoking the program with that argument as an input. If the CALL instruction has the same argument as any previously executed CALL instruction or the program's input value, then it must continue executing in a loop endlessly returning to same state in the same sequence of steps.

    In other words, the only state that needs to be checked is the R0 value at the start of the program. Since this value is stored in a int, it can only have 2^32 possible values on any common C++ implementation, so it's reasonable and easy to brute force check see if a given program with a given input gets stuck in infinite loop.

    In fact, it's possible to use memoization of the return values to brute force check all possible inputs in O(N) time and O(N) space, where N is number of possible inputs. There are various ways you could do this, but the way I would do it is to create three vectors, all with a number of elements equal to the number of possible inputs. The first vector is a bool (bit) vector that records whether or not a given input has been memoized yet, the second vector is also bool vector and it records whether a given input has been used already on the call stack, the second vector is an int vector that records the result and is used a linked list of input values to create a call stack to save space. (In the code below these vectors are called, is_memoized, input_pending and memoized_value respectively.)

    I'd take your interpreter loop and rewrite it to be non-recursive, something like this pseudo-code:

    input = reg[R0]
    if is_memoized[input]: 
        reg[R9] = memoized_value[input]
        return
    input_pending[input] = true
    memoized_value[input] = input  // mark the top of the stack
    
    while true:
        for command in program:
    
            ...
    
            if command.op == CALL:
                 argument = command.r2value
    
                 if input_pending[argument]: 
                     // Since this input value is ready been used as input value 
                     // somewhere on the call stack this the program is about to enter
                     // an identical state as a previous state and so is stuck in
                     // a infinite loop.
                     return false  // program doesn't halt
    
                 if is_memoized[argument]:
                     REG[R9] = memoized_value[argument]
                 else:
                     memoized_value[argument] = input   // stack the input value
    
                     input = argument
                     REG = [input, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                     input_pending[input] = true
                     break  // reevaluate the program from the beginning.
    
            if command.op == RET:
                  argument = command.r2value
                  stacked_input = memoized_value[input]
                  input_pending[input] = false
                  if stacked_input == input:  // at the top of the stack
                      REG[R9] = argument
                      return true   // program halts for this input
    
                  is_memoized[input] = true
                  memoized_value[input] = argument
                  input = stacked_input
                  REG = [input, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
                  break  // reevaluate the program from the beginning
    

    You'd then call this interpreter loop for each possible input, something like this:

    for i in all_possible_inputs:
         if not program.execute(input = i):  // the function written above
              print "program doesn't halt for all inputs"
              return
    print "program halts for all inputs"
    

    A recursive version should be faster since it doesn't have to reevaluate the program on each unmemoized CALL instruction in the program, but it would require hundreds of gigabytes of stack space in the worst case. This non-recursive version only requires 17 GB of memory. Either way it's still O(N) space and time, you're just making one constant factor smaller and another bigger.

    To get this execute in reasonable amount of time you'd also probably want to parse the code once, and execute some byte code representation instead.