proj-plbook-plChSingleIntermedLangs2

Table of Contents for Programming Languages: a survey


continued from [1]

Other intermediate target languages designed for implementing a single high-level language, part 2

C

http://stackoverflow.com/questions/6839737/are-there-any-c-to-bytecode-compilers

Cint, a C/C++ interpreter

VM instructions (from cint-5.18.00/cint/src/bc_inst.h https://github.com/dawehner/root/blob/master/cint/cint/src/bc_inst.h ). In some cases i've written comments on what i think these instructions might mean; these are only guesses from skimming the source code very quickly, as i couldn't find any documentation (although some instructions had code comments, which i often copied in here):

  void LD(G__value* pval);
  void LD(int a);            # push a copy of the value found at depth a in the data stack onto the data stack
  void CL(void);             # clear stack pointer; Take a possible breakpoint, and then clear the data stack, and the structure offset stack.
  void OP2(int opr);         # one of +,-,*,/,%,@,>>,<<,&,| (applied to the data stack)
  int CNDJMP(int addr=0);    # if TOS == 0, then jump to specified address
  int JMP(int addr=0);
  void POP(void);
  void LD_FUNC(const char* fname,int hash,int paran,void* pfunc,   # fname is the string name of a fn; without a longer look, i can't tell if this pushes a reference to a partially applied function onto the data stack, or if it actually calls the function
               struct G__ifunc_table_internal* ifunc, int ifn);
  void LD_FUNC_BC(struct G__ifunc_table* ifunc,int ifn,int paran,void *pfunc);
  void LD_FUNC_VIRTUAL(struct G__ifunc_table* ifunc,int ifn,int paran,void *pfunc);
  void RETURN(void);                                        # halt
  void CAST(int type,int tagnum,int typenum,int reftype);
  void CAST(G__TypeInfo& x);
  void OP1(int opr);              # one of +,-
  void LETVVAL(void);             # something about assigning and lvalues?
  void ADDSTROS(int os);          # ? adds 'os' (offset) to a variable named G__store_struct_offset; dunno what this is for
  void LETPVAL(void);		  # ? i dunno, some kind of assignment? looks at the top two(?) items on the stack and pops the top one
  void TOPNTR(void);              # calls G__val2pointer on the TOS
  void NOT(void);                 # applies NOT to TOS
  void BOOL(void);                # i think this coerces the TOS to bool
  int ISDEFAULTPARA(int addr=0);  # if the stack is not empty, then JMP to addr, otherwise continue
  void LD_VAR(struct G__var_array* var,int ig15,int paran,int var_type);  # i don't understand this at all, but i think this is about loading a variable, which may be an array, into TOS? It first consumes a quantity of 'paran' parameters from the stack, but afaict it then ignores these? Then in the call to G__getvariable, the first parameter, which looks like it is supposed to be a variable name, is always the empty string, which looks like it should cause G__getvariable, in https://github.com/dawehner/root/blob/ed4cde23329c21e940754a56a7195ed082273229/cint/tool/ifdef/get.c , to return the empty string? So i'm confused.
  void ST_VAR(struct G__var_array* var,int ig15,int paran,int var_type);  # 
  void LD_MSTR(struct G__var_array* var,int ig15,int paran,int var_type);
  void ST_MSTR(struct G__var_array* var,int ig15,int paran,int var_type);
  void LD_LVAR(struct G__var_array* var,int ig15,int paran,int var_type);
  void ST_LVAR(struct G__var_array* var,int ig15,int paran,int var_type);
  void CMP2(int operator2);
  void PUSHSTROS(void);
  void SETSTROS(void);
  void POPSTROS(void);
  void SETTEMP(void);
  void FREETEMP(void);
  void GETRSVD(const char* item);
  void REWINDSTACK(int rewind);
  int CND1JMP(int addr=0);
 private:
  void LD_IFUNC(struct G__ifunc_table* p_ifunc,int ifn,int hash,int paran,int funcmatch,int memfunc_flag);
 public:
  void NEWALLOC(int size,int isclass_array);
  void SET_NEWALLOC(int tagnum,int var_type);
  void SET_NEWALLOC(const G__TypeInfo& type);
  void DELETEFREE(int isarray);
  void SWAP();
  void BASECONV(int formal_tagnum,int baseoffset);
  void STORETEMP(void);
  void ALLOCTEMP(int tagnum);
  void POPTEMP(int tagnum);
  void REORDER(int paran,int ig25);
  void LD_THIS(int var_type);
  void RTN_FUNC(int isreturn);
  void SETMEMFUNCENV(void);
  void RECMEMFUNCENV(void);
  void ADDALLOCTABLE(void);
  void DELALLOCTABLE(void);
  void BASEDESTRUCT(int tagnum,int isarray);
  void REDECL(struct G__var_array* var,int ig15);
  void TOVALUE(G__value* pbuf);
  void INIT_REF(struct G__var_array* var,int ig15,int paran,int var_type);
  void PUSHCPY(void);
  void LETNEWVAL(void);
  void SETGVP(int pushpop);
  void TOPVALUE(void);
  void CTOR_SETGVP(struct G__var_array* var,int ig15,int mode); 
  int TRY(int first_catchblock=0,int endof_catchblock=0);
  void TYPEMATCH(G__value* pbuf);
  void ALLOCEXCEPTION(int tagnum);
  void DESTROYEXCEPTION(void);
  void THROW(void);
  void CATCH(void);
  void SETARYINDEX(int newauto);
  void RESETARYINDEX(int newauto);
  void GETARYINDEX(void);
  void PAUSE();
  void NOP(void);
  // new instructions
  void ENTERSCOPE(void);
  void EXITSCOPE(void);
  void PUTAUTOOBJ(struct G__var_array* var,int ig15);
  void CASE(void* x);
  /* void SETARYCTOR(int num); */
  void MEMCPY();
  void MEMSETINT(int mode,map<long,long>& x);
  int JMPIFVIRTUALOBJ(int offset,int addr=0);
  void VIRTUALADDSTROS(int tagnum,struct G__inheritance* baseclass,int basen);
  void cancel_VIRTUALADDSTROS();

see also https://github.com/dawehner/root/blob/master/cint/cint/src/bc_exec_asm.h for some code that executes (some of?) these (and other things generated by the optimizer?)
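To make this concrete, here is a minimal sketch (in Python) of how a few of these instructions might behave if my guesses above are right (again: guesses, not documented Cint behavior):

    stack = []                     # the data stack

    def LD(a):                     # push a copy of the value at depth a
        stack.append(stack[-1 - a])

    def OP2(opr):                  # apply a binary operator to the top two values
        b, a = stack.pop(), stack.pop()
        stack.append({'+': a + b, '-': a - b, '*': a * b}[opr])

    def CNDJMP(pc, addr):          # if TOS == 0 jump to addr, else fall through
        return addr if stack.pop() == 0 else pc + 1

    stack.extend([2, 3])
    OP2('+')                       # stack is now [5]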

CIL

" CIL (for C Intermediate Language) sits in between GENERIC and GIMPLE: control structures are kept and side-effects are hoisted out of expression trees by introducing instructions. It is originally the intermediate language used in safe compiler CCured. While GENERIC is best suited for source code syntactic analyses and GIMPLE for optimization, CIL is ideally suited for source code semantic analyses. As such, it is the frontend of many analyses for C programs, ranging from type-safe compilation to symbolic evaluation and slicing. " -- Yannick Moy. Automatic Modular Static Safety Checking for C Programs. PhD thesis, Université Paris-Sud, January 2009, section 2.1, page 42

"CIL is both lower-level than abstract-syntax trees, by clarifying ambiguous constructs and removing redundant ones, and also higher-level than typical intermediate languages designed for compilation, by maintaining types and a close relationship with the source program. The main advantage of CIL is that it compiles all valid C programs into a few core constructs with a very clean semantics...CIL features a reduced number of syntactic and conceptual forms. For example, all looping constructs are reduced to a single form, all function bodies are given explicit return statements, syntactic sugar like "->" is eliminated and function arguments with array types become pointers. (For an extensive list of how CIL simplifies C programs, see Section 4.) This reduces the number of cases that must be considered when manipulating a C program. CIL also separates type declarations from code and flattens scopes within function bodies. This structures the program in a manner more amenable to rapid analysis and transformation. CIL computes the types of all program expressions, and makes all type promotions and casts explicit. CIL supports all GCC and MSVC extensions except for nested functions and complex numbers. Finally, CIL organizes C’s imperative features into expressions, instructions and statements based on the presence and absence of side-effects and control-flow. Every statement can be annotated with successor and predecessor information. Thus CIL provides an integrated program representation that can be used with routines that require an AST (e.g. type-based analyses and pretty-printers), as well as with routines that require a CFG (e.g., dataflow analyses). CIL also supports even lower-level representations (e.g., three-address code), see Section 8." [2]

Links:

Ray Toal's "Case Study: An IR for C"

" C is such a simple language, an IR is fairly easy to design. Why is C so "simple"?

    Only base types are several varieties of ints and floats
    No true arrays — e1[e2] just abbreviates *(e1+e2); all indices start at zero, there's no bounds checking
    All parameters passed by value, and in order
    Block structure is very restricted: all functions on same level (no nesting); variables are either truly global or local to a top-level function; implementation drastically simplified because no static links are ever needed and functions can be easily passed as parameters without scoping trouble.
    Structure copy is bitwise
    Arrays aren't copied element by element
    Language is small; nearly everything interesting (like I/O) is in a library 

Thus, the following tuples are sufficient:

        x <- y                        x <- y[i]
        x <- &y                       x[i] <- y
        x <- *y                       goto L
        *x <- y                       if x relop y goto L
        x <- unaryop y                param x
        x <- y binaryop z             call p, n
        unaryop is one of: +, -, !, ~, ...
        binaryop is one of: +, -, *, /, %, &, |, ^, ., &&, ||, ...
        relop is one of ==, !=, <, <=, >, >=
        x[i] means the memory location x + i
        call p,n means call p with n bytes of arguments" -- [3]
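For instance, a C fragment like z = a[i] + b; f(z); could be written in these tuples roughly as follows (the temporary t1 and the 4-byte argument size are illustrative choices of mine, not from Toal's notes):

        t1 <- a[i]
        z <- t1 + b
        param z
        call f, 4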

Newspeak

"Newspeak [96] is at the same level as MSIL, while maintaining some control structures and expression trees. These design decisions notably facilitate source code analyses. It is the intermediate language used in static analyzer Penjili." -- Yannick Moy. Automatic Modular Static Safety Checking for C Programs. PhD thesis, Université Paris-Sud, January 2009, section 2.1, page 42

Links:

CompCert (Clight and Cminor)

"The CompCert? project [22] for building a verified compiler presents a chain of intermediate languages for compilation of C programs, from Clight, a large subset of C, to plain assembly, through Cminor, which is similar to MSIL and Newspeak" -- Yannick Moy. Automatic Modular Static Safety Checking for C Programs. PhD thesis, Université Paris-Sud, January 2009, section 2.1, page 42

Links:

SIMPL

"C0 is a Pascal-like subset of C, similar to MISRA-C. It excludes the features of C that make full verification problematic: pointer arithmetic, unions, pointer casts. It is the intermediate language used in the Verisoft project for pervasive formal verification of computer systems.

Simpl is a very generic Sequential IMperative Programming Language. It offers high-level constructs like closures, pointers to procedures, dynamic method invocation, all above an abstract state that can be instantiated differently depending on the language analyzed. In his PhD thesis, Schirmer presents a two-fold embedding of a programming language in HOL theorem prover through Simpl. On the one hand, Simpl statements are deeply embedded inside HOL, which allows one to formally verify correctness of the analysis on statements. On the other hand, the abstract state is shallowly embedded inside HOL, for an easier application to program analysis of many different languages. Simpl is used as target language in the C0 compiler of the Verisoft project." -- Yannick Moy. Automatic Modular Static Safety Checking for C Programs. PhD thesis, Université Paris-Sud, January 2009, section 2.1, page 43


Ruva VM (Ruby VM)

http://ruva.rubyforge.org/classes/Ruva/VM/Opcodes.html


Guile 2.1

Guile Tree IL

http://www.gnu.org/software/guile/docs/master/guile.html/Tree_002dIL.html#Tree_002dIL

a selection, quoted and paraphrased from http://www.gnu.org/software/guile/docs/master/guile.html/Tree_002dIL.html#Tree_002dIL

Tree Intermediate Language (Tree-IL) is a structured intermediate language that is close in expressive power to Scheme. It is an expanded, pre-analyzed Scheme.

Tree-IL is a convenient compilation target from source languages. It can be convenient as a medium for optimization, though CPS is usually better. The strength of Tree-IL is that it does not fix order of evaluation, so it makes some code motion a bit easier.

Tree-IL is “structured” in the sense that its representation is based on records, not S-expressions. This ... ensures that compiling to a lower-level language only requires a limited set of transformations. For example, the Tree-IL type <const> is a record type with two fields, src and exp. Instances of this type are created via make-const. Fields of this type are accessed via the const-src and const-exp procedures. There is also a predicate, const?. See Records, for more information on records.
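As a rough analogue of what "records, not S-expressions" means here, a toy sketch in Python (not Guile code; the real Tree-IL records are Scheme records):

    from dataclasses import dataclass

    @dataclass
    class Const:
        src: object    # source location information, or None
        exp: object    # the constant's value

    def is_const(x):               # plays the role of the const? predicate
        return isinstance(x, Const)

    node = Const(None, 3)          # internal form of the external (const 3)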

All Tree-IL types have a src slot, which holds source location information for the expression.

Although Tree-IL objects are represented internally using records, there is also an equivalent S-expression external representation for each kind of Tree-IL. For example, the S-expression representation of the #<const src: #f exp: 3> expression would be:

(const 3)

Users may program with this format directly at the REPL:

scheme@(guile-user)> ,language tree-il
Happy hacking with Tree Intermediate Language!  To switch back, type `,L scheme'.
tree-il@(guile-user)> (call (primitive +) (const 32) (const 10))
⇒ 42

One may create Tree-IL objects from their external representations via calling parse-tree-il, the reader for Tree-IL.

from http://www.gnu.org/software/guile/docs/master/guile.html/The-Scheme-Compiler.html#The-Scheme-Compiler "The job of the Scheme compiler is to expand all macros and all of Scheme to its most primitive expressions. The definition of “primitive expression” is given by the inventory of constructs provided by Tree-IL, the target language of the Scheme compiler: procedure calls, conditionals, lexical references, and so on. This is described more fully in the next section.

The tricky and amusing thing about the Scheme-to-Tree-IL compiler is that it is completely implemented by the macro expander. Since the macro expander has to run over all of the source code already in order to expand macros, it might as well do the analysis at the same time, producing Tree-IL expressions directly. "

todo:

(void): An empty expression. In practice, equivalent to Scheme’s (if #f #f). 
(const exp): A constant. 
(primitive name): A reference to a “primitive”.
(lexical name gensym): A reference to a lexically-bound variable. The name is the original name of the variable in the source program. gensym is a unique identifier for this variable.
(set! (lexical name gensym) exp): Sets a lexically-bound variable. 
(@ mod name): A reference to a variable in a specific module.
(set! (@ mod name) exp): Sets a variable in a specific module. 
(toplevel name): References a variable from the current procedure’s module. 
(set! (toplevel name) exp): Sets a variable in the current procedure’s module. 
(define (toplevel name) exp): Defines a new top-level variable in the current procedure’s module. 
(if test then else): A conditional. Note that else is not optional. 
(call proc . args): A procedure call. 
(primcall name . args): A call to a primitive. Equivalent to (call (primitive name) . args). This construct is often more convenient to generate and analyze than <call>. As part of the compilation process, instances of (call (primitive name) . args) are transformed into primcalls. 
(seq head tail): A sequence. The semantics is that head is evaluated first, and any resulting values are ignored. Then tail is evaluated, in tail position. 
(lambda meta body): A closure. meta is an association list of properties for the procedure. body is a single Tree-IL expression of type <lambda-case>. As the <lambda-case> clause can chain to an alternate clause, this makes Tree-IL’s <lambda> have the expressiveness of Scheme’s case-lambda. 
(lambda-case ((req opt rest kw inits gensyms) body) [alternate]): One clause of a case-lambda. A lambda expression in Scheme is treated as a case-lambda with one clause. req is a list of the procedure’s required arguments, as symbols. opt is a list of the optional arguments, or #f if there are no optional arguments. rest is the name of the rest argument, or #f. kw is a list of the form, (allow-other-keys? (keyword name var) ...), where keyword is the keyword corresponding to the argument named name, and whose corresponding gensym is var. inits are tree-il expressions corresponding to all of the optional and keyword arguments, evaluated to bind variables whose value is not supplied by the procedure caller. Each init expression is evaluated in the lexical context of previously bound variables, from left to right. gensyms is a list of gensyms corresponding to all arguments: first all of the required arguments, then the optional arguments if any, then the rest argument if any, then all of the keyword arguments. body is the body of the clause. If the procedure is called with an appropriate number of arguments, body is evaluated in tail position. Otherwise, if there is an alternate, it should be a <lambda-case> expression, representing the next clause to try. If there is no alternate, a wrong-number-of-arguments error is signaled. 
(let names gensyms vals exp): Lexical binding, like Scheme’s let. names are the original binding names, gensyms are gensyms corresponding to the names, and vals are Tree-IL expressions for the values. exp is a single Tree-IL expression. 
(letrec names gensyms vals exp): A version of <let> that creates recursive bindings, like Scheme’s letrec
(prompt escape-only? tag body handler): A dynamic prompt. Instates a prompt named tag, an expression, during the dynamic extent of the execution of body, also an expression. If an abort occurs to this prompt, control will be passed to handler, also an expression, which should be a procedure. The first argument to the handler procedure will be the captured continuation, followed by all of the values passed to the abort. If escape-only? is true, the handler should be a <lambda> with a single <lambda-case> body expression with no optional or keyword arguments, and no alternate, and whose first argument is unreferenced. See Prompts, for more information. 
(abort tag args tail): An abort to the nearest prompt with the name tag, an expression. args should be a list of expressions to pass to the prompt’s handler, and tail should be an expression that will evaluate to a list of additional arguments. An abort will save the partial continuation, which may later be reinstated, resulting in the <abort> expression evaluating to some number of values. 

There are two Tree-IL constructs that are not normally produced by higher-level compilers, but instead are generated during the source-to-source optimization and analysis passes that the Tree-IL compiler does. Users should not generate these expressions directly, unless they feel very clever, as the default analysis pass will generate them as necessary.

(let-values names gensyms exp body): Like Scheme’s receive – binds the values returned by evaluating exp to the lambda-like bindings described by gensyms. That is to say, gensyms may be an improper list. <let-values> is an optimization of a <call> to the primitive, call-with-values. 
(fix names gensyms vals body): Like <letrec>, but only for vals that are unset lambda expressions. fix is an optimization of letrec (and let). 

See http://www.gnu.org/software/guile/docs/master/guile.html/Tree_002dIL.html#Tree_002dIL for some details of some optimizations that Guile performs on Guile Tree IL.

Guile CPS

Continuation-passing style (CPS) is Guile’s principal intermediate language, bridging the gap between languages for people and languages for machines. CPS gives a name to every part of a program: every control point, and every intermediate value. This makes it an excellent medium for reasoning about programs, which is the principal job of a compiler.

An Introduction to CPS

from http://www.gnu.org/software/guile/docs/master/guile.html/An-Introduction-to-CPS.html#An-Introduction-to-CPS

" Consider the following Scheme expression:

(begin
  (display "The sum of 32 and 10 is: ")
  (display 42)
  (newline))

Let us identify all of the sub-expressions in this expression, annotating them with unique labels:

(begin
  (display "The sum of 32 and 10 is: ")
  |k1      k2
  k0
  (display 42)
  |k4      k5
  k3
  (newline))
  |k7
  k6

Each of these labels identifies a point in a program. One label may be the continuation of another label. For example, the continuation of k7 is k6. This is because after evaluating the value of newline, performed by the expression labelled k7, we continue to apply it in k6.

Which expression has k0 as its continuation? It is either the expression labelled k1 or the expression labelled k2. Scheme does not have a fixed order of evaluation of arguments, though it does guarantee that they are evaluated in some order. Unlike general Scheme, continuation-passing style makes evaluation order explicit. In Guile, this choice is made by the higher-level language compilers.

Let us assume a left-to-right evaluation order. In that case the continuation of k1 is k2, and the continuation of k2 is k0.

With this example established, we are ready to give an example of CPS in Scheme:

(lambda (ktail)
  (let ((k1 (lambda ()
              (let ((k2 (lambda (proc)
                          (let ((k0 (lambda (arg0)
                                      (proc k4 arg0))))
                            (k0 "The sum of 32 and 10 is: ")))))
                (k2 display))))
        (k4 (lambda _
              (let ((k5 (lambda (proc)
                          (let ((k3 (lambda (arg0)
                                      (proc k7 arg0))))
                            (k3 42)))))
                (k5 display))))
        (k7 (lambda _
              (let ((k6 (lambda (proc)
                          (proc ktail))))
                (k6 newline)))))
    (k1)))

Holy code explosion, Batman! What’s with all the lambdas? Indeed, CPS is by nature much more verbose than “direct-style” intermediate languages like Tree-IL. At the same time, CPS is simpler than full Scheme, because it makes things more explicit.

In the original program, the expression labelled k0 is in effect context. Any values it returns are ignored. In Scheme, this fact is implicit. In CPS, we can see it explicitly by noting that its continuation, k4, takes any number of values and ignores them. Compare this to k2, which takes a single value; in this way we can say that k1 is in a “value” context. Likewise k6 is in tail context with respect to the expression as a whole, because its continuation is the tail continuation, ktail. CPS makes these details manifest, and gives them names. "
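The same idea can be seen in miniature outside of Guile. Here is a toy rendering of CPS in Python (not Guile code): every function takes an explicit continuation k and "returns" by calling it:

    def mul_cps(a, b, k):
        k(a * b)                   # pass the result to the continuation

    def fact_cps(n, k):
        if n == 0:
            k(1)
        else:
            # the continuation of the recursive call: multiply by n,
            # then continue with the caller's k
            fact_cps(n - 1, lambda r: mul_cps(n, r, k))

    fact_cps(5, print)             # prints 120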

from http://www.gnu.org/software/guile/docs/master/guile.html/CPS-in-Guile.html#CPS-in-Guile

todo

Guile’s CPS language is composed of terms, expressions, and continuations.

Terms

A term can either evaluate an expression and pass the resulting values to some continuation, or it can declare local continuations and contain a sub-term in the scope of those continuations.

$continue k src exp: Evaluate the expression exp and pass the resulting values (if any) to the continuation labelled k. The source information associated with the expression may be found in src, which is either an alist as in source-properties or is #f if there is no associated source.

$letk conts body: Bind conts, a list of continuations ($cont instances), in the scope of the sub-term body. The continuations are mutually recursive.

Additionally, the early stages of CPS allow for a set of mutually recursive functions to be declared as a term. This $letrec type is like Tree-IL's <fix>. The contification pass will attempt to transform the functions declared in a $letrec into local continuations. Any remaining functions are later lowered to $fun expressions.

$letrec names syms funs body: Declare the mutually recursive set of functions denoted by names, syms, and funs within the sub-term body. names and syms are lists of symbols, and funs is a list of $fun values. syms are globally unique.

Expressions

Here is an inventory of the kinds of expressions in Guile’s CPS language. Recall that all expressions are wrapped in a $continue term which specifies their continuation.

$void: Continue with the unspecified value.

$const val: Continue with the constant value val.

$prim name: Continue with the procedure that implements the primitive operation named by name.

$fun src meta free body: Continue with a procedure. src identifies the source information for the procedure declaration, and meta is the metadata alist as described above in Tree-IL’s <lambda>. free is a list of free variables accessed by the procedure. Early CPS uses an empty list for free; only after closure conversion is it correctly populated. Finally, body is the $kentry $cont of the procedure entry.

$call proc args: Call proc with the arguments args, and pass all values to the continuation. proc and the elements of the args list should all be variable names. The continuation identified by the term’s k should be a $kreceive or a $ktail instance.

$callk label proc args: $callk is for the case where the call target is known to be in the same compilation unit. label should be some continuation label, though it need not be in scope. In this case the proc is simply an additional argument, since it is not used to determine the call target at run-time.

$primcall name args: Perform the primitive operation identified by name, a well-known symbol, passing it the arguments args, and pass all resulting values to the continuation. The set of available primitives includes all primitives known to Tree-IL and then some more; see the source code for details.

$values args: Pass the values named by the list args to the continuation.

$prompt escape? tag handler: Push a prompt on the stack identified by the variable name tag, which may be escape-only if escape? is true, and continue with zero values. If the body aborts to this prompt, control will proceed at the continuation labelled handler, which should be a $kreceive continuation. Prompts are later popped by pop-prompt primcalls.

Continuations

The remaining element of the CPS language in Guile is the continuation. In CPS, all continuations have unique labels. Since this aspect is common to all continuation types, all continuations are contained in a $cont instance:

$cont k cont: Declare a continuation labelled k. All references to the continuation will use this label.

$kargs names syms body: The most common kind of continuation binds some number of values, and then evaluates a sub-term. $kargs is this kind of simple lambda. Bind the incoming values to the variables syms, with original names names, and then evaluate the sub-term body.

Variable names (the names in the syms of a $kargs) should be globally unique, and also disjoint from continuation labels. To bind a value to a variable and then evaluate some term, you would continue with the value to a $kargs that declares one variable. The bound value would then be available for use within the body of the $kargs.

$kif kt kf: Receive one value. If it is true for the purposes of Scheme, branch to the continuation labelled kt, passing no values; otherwise, branch to kf. For internal reasons, only certain terms may continue to a $kif. Compiling $kif avoids allocating space for the test variable, so it needs to be preceded by expressions that can test-and-branch without temporary values. In practice this condition is true for $primcalls to null?, =, and similar primitives that have corresponding br-if-foo VM operations; see the source code for full details. When in doubt, bind the test expression to a variable, and continue to the $kif with a $values expression. The optimizer should elide the $values if it is not needed. Calls out to other functions need to be wrapped in a $kreceive continuation in order to adapt the returned values to their uses in the calling function, if any.

$kreceive arity k: Receive values on the stack. Parse them according to arity, and then proceed with the parsed values to the $kargs continuation labelled k. As a limitation specific to $kreceive, arity may only contain required and rest arguments. $arity is a helper data structure used by $kreceive and also by $kclause, described below.

$arity req opt rest kw allow-other-keys?: A data type declaring an arity. req and opt are lists of source names of required and optional arguments, respectively. rest is either the source name of the rest variable, or #f if this arity does not accept additional values. kw is a list of the form ((keyword name var) ...), describing the keyword arguments. allow-other-keys? is true if other keyword arguments are allowed and false otherwise. Note that all of these names with the exception of the vars in the kw list are source names, not unique variable names.

Additionally, there are three specific kinds of continuations that can only be declared at function entries.

$kentry self tail clauses: Declare a function entry. self is a variable bound to the procedure being called, and which may be used for self-references. tail declares the $cont wrapping the $ktail for this function, corresponding to the function’s tail continuation. clauses is a list of $kclause $cont instances.

$ktail: A tail continuation.

$kclause arity cont: A clause of a function with a given arity. Applications of a function with a compatible set of actual arguments will continue to cont, a $kargs $cont instance representing the clause body.

See http://www.gnu.org/software/guile/docs/master/guile.html/Compiling-CPS.html#Compiling-CPS for some details of some optimizations that Guile performs on Guile CPS.

Guile 2.1 bytecode

http://www.gnu.org/software/guile/docs/master/guile.html/Instruction-Set.html#Instruction-Set

http://www.gnu.org/software/guile/docs/master/guile.html/Variables-and-the-VM.html#Variables-and-the-VM

http://www.gnu.org/software/guile/docs/master/guile.html/Instruction-Set.html#Instruction-Set

todo

implementation: http://git.savannah.gnu.org/gitweb/?p=guile.git;a=blob;f=libguile/vm-engine.c;h=4ae2aa7285aa28207f0bf0847c1f5f2bcc8f4df3;hb=c30edbbd5b6cfeeca6b8a66578107df17ec96a51#l430

Guile Links

See also Guile section of proj-plbook-plChLispLangs, proj-plbook-plChImplementationCaseStudies, proj-plbook-plChMiscIntermedLangs, proj-plbook-plChTargetsImpl.

WebKit JavaScriptCore

Old documentation of their bytecode, this may be out of date:

https://github.com/WebKit/webkit/blob/master/Websites/webkit.org/specs/squirrelfish-bytecode.html

Those docs may be out of date because they don't match this, but maybe i am misunderstanding and this is something else: https://github.com/WebKit/webkit/blob/master/Source/JavaScriptCore/bytecode/BytecodeList.json

LLInt:

https://wingolog.org/archives/2012/06/27/inside-javascriptcores-low-level-interpreter

SpiderMonkey Javascript bytecode

Statements:

Variables and scopes:

operators:

literals:

other:

OCaml

OCaml has both a compiler and a bytecode interpreter.

The bytecode interpreter is based on an earlier CAML project/version called ZINC.

OCaml Lambda Form IL

https://realworldocaml.org/v1/en/html/the-compiler-backend-byte-code-and-native-code.html

ZINC abstract machine (ZAM) (1990)

The ZAM has two stacks: the argument stack and the return stack.

todo

Links:

ZINC abstract machine (ZAM) with cache (1990)

Links:

ZINC CAML AST (1990)

type expression = mutable Zident of expr_ident          (* variable "x" or "module.y" *)
  | Zconstant of struct_constant                        (* constant "3.14" *)
  | Zconstruct of constr_desc global & expression list  (* constructor application *)
  | Zapply of expression & expression list              (* multiple application *)
  | Zlet of (pattern & expression) list & expression    (* local binding *)
  | Zletrec of (pattern & expression) list & expression (* local recursive binding *)
  | Zfunction of (pattern list & expression) list       (* functions (with pattern matching) *)
  | Ztrywith of expression & (pattern & expression) list (* exception handling *)
  | Zsequence of expression & expression                (* sequence *)
  | Zcondition of expression & expression & expression  (* if...then...else... *)
  | Zwhile of expression & expression                   (* "while" loop *)
  | Zsequand of expression & expression                 (* sequential "and" *)
  | Zsequor of expression & expression                  (* sequential "or" *)
  | Zconstraint of expression & type_syntax             (* type constraint *)
  | Zassign of string & expression                      (* assignment *)

and expr_ident = Zglobal of value_desc global           (* global variable, with its descriptor *)
  | Zlocal of string                                    (* local variable *)

type pattern = Zwildpat                                 (* underscore "_" *)
  | Zvarpat of string                                   (* variable *)
  | Zaliaspat of pattern & string                       (* alias "... as y" *)
  | Zconstantpat of atomic_constant                     (* constant *)
  | Zconstructpat of constr_desc global & pattern list  (* construction *)
  | Zorpat of pattern & pattern                         (* alternative *)
  | Zconstraintpat of pattern & type_syntax             (* type constraint *)

type atomic_constant = ACnum of num
  | ACint of int
  | ACfloat of float
  | ACstring of string

and struct_constant = SCatom of atomic_constant
  | SCblock of constr_tag & struct_constant list

Links:

ZINC CAML Enriched lambda-calculus (1990)

type lambda = Lvar of num                                (* local variable (de Bruijn's number) *)
  | Lconst of struct_constant                            (* constants *)
  | Lapply of lambda & lambda list                       (* multiple application *)
  | Lfunction of num & lambda                            (* curried function (num = arity) *)
  | Llet of lambda list & lambda                         (* local binding *)
  | Lletrec of lambda list & lambda                      (* local recursive binding *)
  | Lprim of primitive & lambda list                     (* primitive operation *)
  | Lcond of lambda & (atomic_constant & lambda) list    (* equality tests with constants *)
  | Lswitch of num & lambda & (constr_tag & lambda) list (* jump through table *)
  | Lstaticfail                                          (* as "raise failure", but with static scope *)
  | Lstatichandle of lambda & lambda                     (* handler for static failures *)
  | Lhandle of lambda & lambda                           (* handler for "real" exceptions (dynamic scope) *)
  | Lifthenelse of lambda & lambda & lambda              (* conditional *)
  | Lsequence of lambda & lambda                         (* sequence *)
  | Lwhile of lambda & lambda                            (* "while" loop *)
  | Lsequand of lambda & lambda                          (* sequential "and" *)
  | Lsequor of lambda & lambda                           (* sequential "or" *)

and primitive = Pget_global of string & string           (* access to global variables *)
  | Pset_global of string & string                       (* modification of global variables *)
  | Pmakeblock of constr_tag                             (* building a tuple in the heap *)
  | Pfield of num                                        (* accessing one field of a tuple *)
  | Psetfield of num                                     (* modifying one field of a tuple *)
  | Pccall of num                                        (* calling a C primitive *)
  | Praise                                               (* raising an exception *)
  | Ptest of bool_test                                   (* comparisons *)
  | ...                                                  (* arithmetic operations, and much more *)

and bool_test = Pequal_test
  | Pnotequal_test
  | Peq_test
  | Pnoteq_test
  | Pnum_test of num prim_test
  | Pint_test of int prim_test
  | Pfloat_test of float prim_test
  | Pstring_test of string prim_test
  | Peqtag_test of constr_tag
  | Pnoteqtag_test of constr_tag

and 'a prim_test = PTeq
  | PTnoteq
  | PTnoteqimm of 'a
  | PTlt
  | PTle
  | PTgt
  | PTge

Links:

ZINC CAML Linear code (1990)

Links:

ZINC CAML bytecode (1990)

Constants and literals:

Function handling:

Environment handling:

Building and destructuring blocks:

Integers:

Floating-point numbers:

Strings:

Predicates:

Eq, Equal:

Branches and conditional branches:

Miscellaneous:

Links:

Caml bytecode (Zinc bytecode) 2010

version 3.11.2

the following is quoted or paraphrased from http://cadmium.x9c.fr/distrib/caml-instructions.pdf

" The virtual machine executing a caml bytecode is stack-centered. The caml stack, augmented with an accumulator, stores values. A caml value can be either:

Along with the stack, the virtual machine contains seven registers:

Stack and accumulator operations:

Arithmetic:

Conditionals and local branches:

Other control flow:

Memory and constant and variable loads/stores and allocations:

Composite data:

Types:

Misc:

Tests:

Objects:

Debugger:

Redundancies for efficiency (push then something else, or constants instead of parameters):

Links:

AliceML

AliceML core

Copied or paraphrased from https://synrc.com/publications/cat/Functional%20Languages/AliceML/christian_mueller_diplom.pdf :

"Alice Abstract Code"

data types with examples:

'restricted value set': integers, tuples, closures, tagged values and exceptions

See Guido Tack's Diploma thesis (Linearisation, minimisation and transformation of data graphs with transients. Diploma thesis, Programming Systems Lab, Universität des Saarlandes, Saarbrücken, May 2003) for the representation format of these data types in the SEAM Abstract Store (graph store).

Code is stored both as Alice Abstract Code (so that it can be pickled), and also as compiled code (either compiled to bytecode or to native code). Compiled code is represented as binary 'chunk' nodes in the SEAM Abstract Store; since these are opaque to SEAM, the garbage collector can't tell which other nodes they reference, therefore binary code is wrapped by a pair node which contains (points to) both a binary chunk node and also to an "immediate environment" which is a node holding references to other nodes that are referred to by the binary code. Binary code is position-independent in anticipation of being moved around by the garbage collector.

" Alice represents code as a directed acyclic graph. The acyclic structure is achieved by transforming all loops into recursion. The nodes of the graph encode the instructions of the abstract code. Except for the root node, each node has either one or two predecessors. A node has n ≥ 0 successors. The code graph explicitly encodes the control flow, which means that each instruction knows about its continuations.

The static Alice compiler removes all information about types. Static type safety guarantees that the abstract code instructions operate on correctly typed arguments. In particular, there are no polymorphic instructions, which means that the instructions do not have to test the types of their arguments. "

AliceML Abstract Code Instructions

Copied or paraphrased from https://synrc.com/publications/cat/Functional%20Languages/AliceML/christian_mueller_diplom.pdf :

identifiers:

The abstract code explicitly differentiates between defining and applied occurrences of identifiers. Let id be the set of identifiers and value denote the set of all Alice values. Defining occurrences of identifiers can have the following form:

    def ::= IdDef {id : id} | Wildcard

There is either a local definition of an identifier or a wildcard. A wildcard is only a hint that the result of the definition is not used.

There are three forms of applied occurrences:

    ref ::= Global {id : id} | Immediate {value : value} | Local {id : id}

Each abstract code instruction is parameterized over ref and def, respectively. This yields a compact definition of the instructions. However, execution speed suffers because each identifier access is preceded by a case distinction. This is a first hint that the primary design focus of the abstract code is not execution speed but compact definition and ease of (just-in-time) compilation.

join points:

A node in the abstract code graph is a join point if it has two predecessors. The abstract code explicitly encodes join points as special nodes. So by definition, all other nodes (except the root node) have exactly one predecessor. The Shared instruction is just a marker for a join point. It contains a stamp that distinguishes it from other join points in the code graph.

allocation of data structures:

creation plus initialization

The style is the same for all values: first the destination is given, then several data elements for initialization are provided, and the last component next identifies the successor instruction.

access to data structures:

conditionals:

Both instructions: 1. extract the tag t from the test value. 2. look up the tag in a vector of alternatives. 3. The corresponding entry in the vector contains a list of local identifiers and a successor instruction. The local identifiers are bound to the fields of the tagged value, and then we jump to the successor instruction.

The general test and the compact test only differ in the second step. A linear search on the vector is performed to find an entry with tag t. If there is no corresponding entry, the 'else' branch is taken.

Compact tests are used if the test cases cover a contiguous range of tags from 0 to k. Then, the branches can be stored compactly in a vector, such that their position defines the associated integer tag. After a range check 0<=t<=k, the tag t serves as an index into the vector, yielding constant time lookup. The range check can be removed if the test is exhaustive, which means that the test table covers all possible cases. The static Alice compiler can find out about exhaustive tests on the basis of type information. For all exhaustive tests, the 'else' branch is set to NONE.
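Restated as a sketch (in Python; hypothetical code, just transcribing the algorithm described above):

    def general_test(t, alternatives, else_branch):
        for tag, branch in alternatives:   # linear search over (tag, branch) pairs
            if tag == t:
                return branch
        return else_branch

    def compact_test(t, branches, else_branch, exhaustive=False):
        # branches is a vector indexed directly by tag: constant-time lookup
        if exhaustive or 0 <= t < len(branches):
            return branches[t]
        return else_branch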

procedure application:

exceptions:

join point wrapper:

primitives:

Encoded as procedure calls (using Apply and ApplyTail) to a library of built-in procedures (including, for example, arithmetic, an instruction to initiate concurrent computation, and string ops). For speed, the bytecode offers primitives for some of these, and the native compiler inlines some of them.

Links:

AliceML bytecode

Copied or paraphrased from https://synrc.com/publications/cat/Functional%20Languages/AliceML/christian_mueller_diplom.pdf :

"Alice byte code can be roughly described as a linearizion and specialization of the abstract code."

Identifiers:

Allocation:

Data Structure Access:

All instructions that require the actual value of an argument contain an internal test on transient values. If the test finds out that the argument is not yet determined, the interpreter interrupts execution and issues a request to the scheduler. There is, however, a single situation in which we need an explicit transient test: pattern matching on the empty tuple, i.e. the unit value.

Waiting on a potential transient:

Local control flow and conditionals:

Procedure Application:

Exceptions:

Calling convention conversion:

There must be some way to load procedure arguments out of the global register bank into byte code registers. Due to the calling convention of Alice (Section 2.2.3), the number of arguments that are in the register bank does not necessarily coincide with the receiver's expectation. So in some cases, the receiver has to convert the arguments before loading them into byte code registers. The two CCC instructions in the byte code perform the conversion internally and then load the arguments into the dedicated registers.

Inlined primitives:

Other bytecodes for efficiency: "A concise set of byte code instructions is nice for presentation and it eases compiler implementation. However, for many instructions, there are special cases that occur quite often. Adding specialized instructions to the instruction set noticeably increases execution speed. Alice offers tuples of fixed arity. In practice, most of the tuples are pairs and triples. Therefore, it pays off to introduce specialized byte code instructions for construction and deconstruction of pairs and triples. Other examples for specializations are call instructions. The number of arguments rarely exceeds three, such that performance benefits from specialized call instructions for argument numbers from zero to three arguments. There are also specialized arithmetic instructions, for example, an increment and a decrement instruction for integer arguments. The semantics of Alice prescribes overflow and underflow checks for all arithmetic operations. The general instruction to add two integers checks on the result whether it falls below the smallest representable number or exceeds the biggest representable number. The increment instruction only needs to check for overflow, and the decrement instruction only needs to check for underflow."

"Super-instructions combine several virtual machine instr uctions into one, thus reduc- ing code size as well as dispatch and argument access overhea d. The literature distin- guishes two ways of implementing super-instructions, name ly a static and a dynamic approach [19]. In the static approach the interpreter write r identifies common in- struction sequences and adds a new implementation for each s uper-instruction. In the dynamic approach super-instructions are created at run-ti me. The Alice byte code system uses static super-instructions. There is, for instance, an instruction that combines pair construction and initializ ation."

Links:


Ruby YARV VM bytecode

Instruction list:

Other lists of those instructions and related (possibly out of date):

Instruction list:


Berkeley Packet Filter (BPF) bytecode (Linux variants)

BPF is a VM language used for packet filtering. There is a later variant "eBPF" (or "internal BPF"). https://www.kernel.org/doc/Documentation/networking/filter.txt describes the bytecodes for both of these, starting with section "BPF engine and instruction set".

BPF has two special registers A and X, and 16 memory locations (or GPRs (general purpose registers)). All are 32 bits wide.

Instructions are 64 bits wide: a 16-bit opcode, one 32-bit operand, and two special 8-bit jump targets ("one for condition "jump if true", the other one "jump if false""). There are 10 or so addressing modes (different subsets of them are applicable to different instructions) (where are the addressing modes encoded? i'm not sure). The addressing modes include constant, A, X, the GPRs, and various addressing modes to address locations within the current packet (remember, this is a language for packet filtering).
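As a toy illustration of the jt/jf mechanism (a sketch, not the kernel's encoding: real BPF packs these fields into the instruction word, and the jump targets are relative skip counts):

    # each instruction is (opcode, k, jt, jf); A is the accumulator
    def run(prog, pkt):
        A, pc = 0, 0
        while True:
            op, k, jt, jf = prog[pc]
            if op == 'ld_h_abs':        # A = 16-bit halfword at packet offset k
                A = (pkt[k] << 8) | pkt[k + 1]
                pc += 1
            elif op == 'jeq':           # "jump if true" / "jump if false"
                pc += 1 + (jt if A == k else jf)
            elif op == 'ret':           # accept k bytes of the packet (0 = drop)
                return k

    # accept only packets whose EtherType field (offset 12) is IPv4 (0x0800)
    prog = [('ld_h_abs', 12, 0, 0),
            ('jeq', 0x0800, 0, 1),
            ('ret', 0xFFFF, 0, 0),
            ('ret', 0, 0, 0)]
    print(run(prog, bytes(12) + b'\x08\x00'))   # -> 65535 (accept)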

eBPF "is modelled closer to the underlying architecture to mimic native instruction sets..It is designed to be JITed with one to one mapping, which can also open up the possibility for GCC/LLVM compilers to generate optimized eBPF code through an eBPF backend that performs almost as fast as natively compiled code."

eBPF has 10 64-bit registers instead of 2 32-bit (A and X), plus a read-only frame pointer register. It also contains a CALL instruction and a calling convention. The convention is: "

Note that only functions that take 5 or less arguments are supported by eBPF CALL. This was chosen as a lowest-common-denominator after a survey of calling conventions associated with popular 64-bit architectures: "Natively, x86_64 passes first 6 arguments in registers, aarch64/ sparcv9/mips64 have 7 - 8 registers for arguments; x86_64 has 6 callee saved registers, and aarch64/sparcv9/mips64 have 11 or more callee saved registers."

eBPF has "conditional jt/jf targets replaced with jt/fall-through".

eBPF is limited to 4096 instructions (though this limit can be subverted by use of BPF_MAP_TYPE_PROG_ARRAY) and, in the past, could only jump forward (no loops; i'm not sure if original non-e BPF had this restriction). Now, "the only jumps which are allowed to go backwards are the end of a loop with a fixed number of iterations" [4]; there is also a limit of 32 nested calls.

eBPF has some new instructions:

eBPF has 'maps' which are key-value associative arrays/dictionaries that support the following operations:

eBPF does some verification to disallow:

" The original Berkeley Packet Filter (BPF) [PDF] was designed for capturing and filtering network packets that matched specific rules. Filters are implemented as programs to be run on a register-based virtual machine.

The ability to run user-supplied programs inside of the kernel proved to be a useful design decision but other aspects of the original BPF design didn't hold up so well. For one, the design of the virtual machine and its instruction set architecture (ISA) were left behind as modern processors moved to 64-bit registers and invented new instructions required for multiprocessor systems, like the atomic exchange-and-add instruction (XADD). BPF's focus on providing a small number of RISC instructions no longer matched the realities of modern processors.

So, Alexei Starovoitov introduced the extended BPF (eBPF) design to take advantage of advances in modern hardware. The eBPF virtual machine more closely resembles contemporary processors, allowing eBPF instructions to be mapped more closely to the hardware ISA for improved performance. One of the most notable changes was a move to 64-bit registers and an increase in the number of registers from two to ten. Since modern architectures have far more than two registers, this allows parameters to be passed to functions in eBPF virtual machine registers, just like on native hardware. Plus, a new BPF_CALL instruction made it possible to call in-kernel functions cheaply. "

" The 3.15 kernel sees another significant change in BPF. The language has now been split into two variants, "classic BPF" and "internal BPF". The latter expands the set of available registers from two to ten, adds a number of instructions that closely match real hardware instructions, implements 64-bit registers, makes it possible for BPF programs to call a (rigidly controlled) set of kernel functions, and more. Internal BPF is more readily compiled into fast machine code and makes it easier to hook BPF into other subsystems. "

Variants

bpftrace

Link

---

Rust MIR

Note: Rust MIR is still in development, yet i wrote the following in present tense.

Rust compiles Rust -> HLL AST -> MIR -> LLVM.

Rust's MIR comes after type checking. Types are explicit in MIR. Borrow checking is done on MIR. Basic blocks are explicit in MIR (MIR is a control-flow graph (CFG)). In MIR, control flow is translated to GOTO and IF and SWITCH and function calling (i think) (and a function call can specify a panic handler). Monomorphization is done after MIR (on LLVM).
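A sketch of the CFG shape this implies (hypothetical, modeled in Python; this is not actual MIR syntax): lowering if c { x = 1 } else { x = 2 }; return x into basic blocks whose terminators are SWITCH/GOTO/RETURN:

    blocks = {
        'bb0': ([],          ('switch', 'c', {True: 'bb1', False: 'bb2'})),
        'bb1': ([('x', 1)],  ('goto', 'bb3')),
        'bb2': ([('x', 2)],  ('goto', 'bb3')),
        'bb3': ([],          ('return', 'x')),
    }

    def run(blocks, env):
        bb = 'bb0'
        while True:
            stmts, term = blocks[bb]
            for var, val in stmts:       # straight-line statements
                env[var] = val
            if term[0] == 'goto':
                bb = term[1]
            elif term[0] == 'switch':
                bb = term[2][env[term[1]]]
            else:                        # ('return', var)
                return env[term[1]]

    print(run(blocks, {'c': True}))      # -> 1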

---

Self

An Efficient Implementation of SELF, a Dynamically-Typed Object-Oriented Language Based on Prototypes.

" Each bytecode in the byte array represents a single byte-sized virtual machine instruction,and is divided into two parts: a 3-bit opcode and a 5-bit object array index...The only opcodes used to represent SELFprograms are the following:

SELF push self onto the execution stack LITERAL <value index> push a literal value onto the execution stack SEND <message name index> send a message, popping the receiver and arguments off the execution stack and pushing the result SELF SEND <message name index> send a message to self, popping the arguments off the execution stack and pushing the result SUPER SEND <message name index> send a message to self, delegated to all parents, popping the arguments off the execution stack and pushing the result DELEGATEE <parent name index> delegate the next message send to the named parent NON-LOCAL RETURN execute a non-local return from the lexically-enclosing method activation INDEX-EXTENSION <index extension> extend the next index by prepending the index extension

The index for the opcodes is an index into the accompanying object array. The 5-bit offset allows the first 32 message names and literals to be referred to directly;indices larger than 32 are constructed using extra INDEX-EXTENSION instruc-tions. In SELF source code, primitive operations are invoked with the same syntax usedto send a message, except that the message name begins with an underscore (“_”). " -- [5]

"The byte codes needed to express SELF programs fall into only three classes: base values (LITERAL and SELF), message sends, and non-local return." ((and index extension, which is encoding))

"Smalltalk-80 defines a much larger set of byte codes [12], tuned to minimize space and maximize interpretation speed, and includes byte codes to fetch and store local, instance, class, pool, and global vari- ables and shortcut byte codes for common-case operations such as loading constants like nil, true, and 0. Smalltalk-80 systems also use special control flow byte codes to implement common boolean messages like ifTrue:ifFalse: and whileTrue:; the Small- talk parser translates the message sends into conditional and unconditional branch byte codes, open-coding the argument blocks. Similarly, the == message is auto- matically translated into the identity comparison primitive operation byte code. A similar optimization is included for messages like + and <, which the parser trans- lates into special byte codes. When executed, these byte codes either directly invoke the corresponding integer primitive operation (if the receiver is an integer), or perform the message send (if the receiver isn’t an integer). "

"Each byte code in the byte array represents a single byte-sized virtual machine instruction, and is divided into two parts: a 3-bit opcode and a 5-bit object array index. " eg for self.x, the five bits are the index of method 'x' within object 'self'.


Scheme 79 chip S-code

Primitives

from the Appendix (things that start with 'primitive-'):

---

Fantom fcode

Fantom compiles to JVM and CLR. 'fcode' is its IR.

Fantom fcode operation argument types

Fantom fcode instructions

Load(Null|False|True|Int|Float|Decimal|Str|Duration|Type|Uri): load literal onto stack (some are immediate, some are LOADK eg load a constant from a constant table)
(Load|Store)Var (Register): takes a local var register index (0 is this)
(Load|Store)Instance (FieldRef): load/store field from/to storage
(Load|Store)Static (FieldRef): load/store static field from/to storage
Compare(EQ|NE) (TypePair): a.equals(b)
Compare(LE|LT|GT|GE) (TypePair): a.compare(b) <= 0, etc

Links:

---

Haxe AST

Haxe is a language that compiles to many target platforms.

The Haxe AST is exposed to Haxe Macros [6].

types of AST nodes (from ExprDef):

other stuff in that file:

Links:

PreScheme

"Scheme 48...The name was intended to reflect the idea that the implementation should be so clear and simple that you could believe the system could have been written in 48 hours (or perhaps, when S48 got complex enough, this was altered to "could have been *read* in 48 hours")....Its innovations were its module system, the language in which its VM was defined ("pre-scheme"), and its stack-management technology. ... Pre-scheme was quite interesting. Kelsey published a paper on it, as well, I believe. It was Scheme in the sense that you could load it into a Scheme system and run the code. But it was restrictive -- it required you to write in a fashion that allowed complete Hindley-Milner static type inference, and all higher-order procedures were beta-substituted away at compile time, meaning you could *straightforwardly* translate a prescheme program into "natural" C code with C-level effiency. That is, you could view prescheme as a really pleasant alternative to C for low-level code. And you could debug your prescheme programs in the interactive Scheme development environment of your choice, before flipping a switch and translating to C code, because prescheme was just a restricted Scheme. The Scheme 48 byte-code interpreter was written in prescheme. Prescheme sort of died -- beyond the academic paper he wrote, Kelsey never quite had the time to document it and turn it into a standalone tool that other people could use (Ian Horswill's group at Northwestern is an exception to that claim -- they have used prescheme for interesting work). There are ideas there to be had, however. " -- [7]

prescheme appears to be implemented in http://www.s48.org/cgi-bin/hgwebdir.cgi/s48-stable/file/c0f7ae288483/scheme/prescheme

From the Wikipedia page:

" Variants

Macro-Free PreScheme: Obtained from Full PreScheme by expanding all macros.
Pure PreScheme: A tail-recursive, closure-free dialect of PreScheme, obtained from Macro-Free PreScheme by hoisting lambda expressions and beta expansion.
VLISP PreScheme
Simple PreScheme "

Links:

---

BCPL O-code

https://en.m.wikipedia.org/wiki/O-code

The BCPL compiler was the first to define a virtual machine for the purpose of portability [8]. The language of the VM is called OCODE.

Links:

BCPL INTCODE

Later, a lower-level assembly-like virtual machine was added beneath OCode, called INTCODE.

" Unlike a conventional computer, the INTCODE machine is not fully specified, and such details as the word-length, byte-size, and instruction format are left undefined. The machine has 6 control registers as follows: A and B are accumulators for computing expressions, C is the sequence control register giving the location of the next instruction to be obeyed, D is a register used to hold the computed address of the current instruction, and P and G are index registers. All these registers are the size of a machine word.

 An instruction has a 3 bit function field, and an address
 field of unspecified size, 2 bits for index modification and an
 indirection bit.  These fields may be laid out in the word in
 any way that is convenient for the interpreter.  An instruction
 is executed as follows.  Firstly, it is fetched from the store
 and C is incremented, then, the computed address is formed by
 assigning the address field to D, conditionally adding P or G
 as specified by the modification field, and indirecting if
 required.  Finally, the operation specified by the function
 field is performed.
 The 8 machine functions are: LOAD, ADD, STORE, JUMP,
 JUMP ON TRUE, JUMP ON FALSE, CALL, and EXECUTE OPERATION...
 CALL is used in the compilation of a BCPL
 function or routine call. ...EXECUTE OPERATION provides a miscellaneous collection of arithmetic,
 relational, logical, and control functions, the actual function
 being determined by D. Most of the functions operate on B and
 A, usually leaving a result in A.  For example, X7 will cause the
 remainder after the integer division of B by A to be assigned to
 A.  There are 23 execute operations in the basic INTCODE machine,
 but for practical use, a further 5 to 10 are needed in order to
 provide an adequate interface with the operating system.
 " -- [http://www.gtoal.com/languages/bcpl/amiga/bcpl/booting.txt]

Later on, presumably INTCODE was refined into CINTCODE, which is much more elaborate (see page 155 (PDF page 163) of [9]).

---

Featherweight Erlang

Links:

CIL (C Intermediate Language)

https://people.eecs.berkeley.edu/~necula/Papers/cil_cc02.pdf

" exp::=Constant(const)

Lvalue(lvalue)SizeOfExp?(exp)SizeOfType?(type)AlignOfExp?(exp)AlignOfType?(type)UnOp?(unop,exp)BinOp?(binop,exp,exp)Cast(type,exp)AddressOf?(lvalue)StartOf?(lvalue)instr::=Set(lvalue,exp)Call(lvalue option,exp,exp list)Asm(raw strings,lvalue list,exp list)

Fig. 4.The syntax of CIL expressions and instructions "

" stmt::=Instr(instr list)

"
Return(exp option)Goto(stmt)BreakContinueIf(exp,stmt list,stmt list)Switch(exp,stmt list,stmt list)Loop(stmt list)Fig. 5.The syntax of CIL statements.

" type::=Void

"
Int(intKind)Float(floatKind)Ptr(type)Array(type,exp)Fun(type,variable list)Enum(enumInfo)Named(string,type)Struct(compInfo)Union(compInfo)enumInfo::= (string,item list)compInfo::= (string,field list)Fig. 6.The abstract syntax of CIL types

mruby bytecode

---

Zig IR

https://github.com/ziglang/zig/blob/master/src/ir.zig

    add
    alloc
    arg
    assembly
    bitcast
    block
    br
    breakpoint
    brvoid
    call
    cmp_lt
    cmp_lte
    cmp_eq
    cmp_gte
    cmp_gt
    cmp_neq
    condbr
    constant
    dbg_stmt
    isnonnull
    isnull
    iserr 
    load   /// Read a value from a pointer
    loop
    ptrtoint
    ref
    ret
    retvoid
    varptr 
    store  /// Write a value to a pointer. LHS is pointer, RHS is value
    sub
    unreach
    not
    floatcast
    intcast
    unwrap_optional
    wrap_optional

V8's Ignition

https://v8.dev/docs/ignition https://medium.com/dailyjs/understanding-v8s-bytecode-317d46c94775

"V8 is a multi-tier VM. Its first tier is called Ignition, it is a bytecode stack machine with an accumulator register. V8 starts by compiling the code to Ignition bytecodes." [10]

Ruby Rhizome bytecode

" These are the instructions in the Rhizome bytecode format:


C4 bytecode

C4 is a small compiler for a subset of C.

https://github.com/rswier/c4/blob/master/c4.c

" opcodes enum { LEA ,IMM ,JMP ,JSR ,BZ ,BNZ ,ENT ,ADJ ,LEV ,LI ,LC ,SI ,SC ,PSH , OR ,XOR ,AND ,EQ ,NE ,LT ,GT ,LE ,GE ,SHL ,SHR ,ADD ,SUB ,MUL ,DIV ,MOD , OPEN,READ,CLOS,PRTF,MALC,FREE,MSET,MCMP,EXIT };

types enum { CHAR, INT, PTR };

...

    if      (i == LEA) a = (int)(bp + *pc++);                             // load local address
    else if (i == IMM) a = *pc++;                                         // load global address or immediate
    else if (i == JMP) pc = (int *)*pc;                                   // jump
    else if (i == JSR) { *--sp = (int)(pc + 1); pc = (int *)*pc; }        // jump to subroutine
    else if (i == BZ)  pc = a ? pc + 1 : (int *)*pc;                      // branch if zero
    else if (i == BNZ) pc = a ? (int *)*pc : pc + 1;                      // branch if not zero
    else if (i == ENT) { *--sp = (int)bp; bp = sp; sp = sp - *pc++; }     // enter subroutine
    else if (i == ADJ) sp = sp + *pc++;                                   // stack adjust
    else if (i == LEV) { sp = bp; bp = (int *)*sp++; pc = (int *)*sp++; } // leave subroutine
    else if (i == LI)  a = *(int *)a;                                     // load int
    else if (i == LC)  a = *(char *)a;                                    // load char
    else if (i == SI)  *(int *)*sp++ = a;                                 // store int
    else if (i == SC)  a = *(char *)*sp++ = a;                            // store char
    else if (i == PSH) *--sp = a;                                         // push

...

    else if (i == OPEN) a = open((char *)sp[1], *sp);
    else if (i == READ) a = read(sp[2], (char *)sp[1], *sp);
    else if (i == CLOS) a = close(*sp);
    else if (i == PRTF) { t = sp + pc[1]; a = printf((char *)t[-1], t[-2], t[-3], t[-4], t[-5], t[-6]); }
    else if (i == MALC) a = (int)malloc(*sp);
    else if (i == FREE) free((void *)*sp);
    else if (i == MSET) a = (int)memset((char *)sp[2], sp[1], *sp);
    else if (i == MCMP) a = memcmp((char *)sp[2], (char *)sp[1], *sp);
    else if (i == EXIT) { printf("exit(%d) cycle = %d\n", *sp, cycle); return *sp; }
    "

---

Janet bytecode

https://github.com/janet-lang/janet/blob/master/src/core/asm.c https://github.com/janet-lang/janet/blob/master/src/core/vm.c

---

APL Typed Array Intermediate Language

" Shapes (δ), primitive operators (op), values (v), and expressions (e) are defined as follows: δ ::= 〈~n〉 (shapes) a ::= i

op ::= addi v ::= [~a]δ e ::= v " -- [12]
d
subi muli mini maxi (operators)
addd subd muld mind maxd
iota each reduce i2d
reshape0reshape rotate
transp transp2zipWith
shape take drop first
cat cons snoc (derived ops)
shapeSh catSh consSh snocSh (shape ops)
iotaSh rotateSh
takeSh dropSh firstSh
λx.e (values)
x [~e]e e′ (expressions)
let x = e1 in e2op(~e)

---

qb.js VM

qb.js: An implementation of QBASIC in Javascript

---