我发现,在C的层面上实现pPy几乎是不可能了。
pPy,即polite-Python,这颗曾乘风飞翔了两天的蒲公英种子落在了一块名为C的板结的固土上,十个小时后死去了。与它一同被沙土埋葬的,还有它在云端舞蹈时绮丽的幻想,和它窥探到的来自天堂的能使人间和睦的福音。
自落地后,pPy的挣扎从LL(1) parser开始。
当摸着石头淌进编译原理的深潭时,我发现,python在3.9版本的PEP617中弃用了LL(1) parser,作为一个相对较新的版本,偌大的互联网竟没有几篇资料。新版的python改用了PEG parser,致使cpython源码中的Grammar文件也被弃用,连带更新了大量的编译器源码,仅有的几篇技术博客也早已过时。
新版的python语法规则放在Grammar/Python.gram
文件中,相应的token则放在Grammar/Tokens
里。Python运行时,python的语法会首先经过Parser/tokenizer.c
这个文件,被按照token文件的规则解构成一个个token,而后被送入Parser/parser.c
中进行翻译。parser会依据Parser/Python.asdl
这个文件中的语法将token们解析成AST然后继而变成可以被执行的code object。因此,在实际操作中,当尝试更改语法时,Python/Python-ast.c
, Python/ast.c
, 以及Python/symtable.c
中的代码也需要更改。好在,这些文件里的部分是可以自动生成的,但是依然有相当一部分需要手动更改。
最关键的parser.c
是不用改的,因为这个编译器是被pegen模块生成的,pegen有两个版本,一个是c的老版本,一个是python的新版本,在Tools/peg_generator/pegen
里。确实,python甚至可以在自己还没有被编译好的时候运行自己的python代码,好像是预先编译过了。在我编译cpython时,老版本的Parser/pegen.c
似乎没有被弃用,依然运行,执行的工作却和生成文件关系不大。有意思的是,pegen这个编译编译器的编译器甚至也是被一个编译器编译的,因为pegen是翻译Grammar/Python.gram
的,而这个语法的语法规则是被元语法定义的。元语法在这里Tools/peg_generator/pegen/metagrammar.gram
, 不过这个应该很少会需要改动。
在cpython里,函数的调用关系和上述流程并不相同。如Parser/tokenizer.c
中的第1352到2064行有一个相当长的函数叫tok_get
,里面是具体python如何处理文本字符串的。这个函数最终会被Parser/pegen.c
里的_PyPegen_fill_token
函数调用,而这个函数则在生成的Parser/parser.c
中被大量调用。这么说来终究是有些模糊,那就从python运行的主函数开始吧。
/* Minimal main program -- everything is loaded from the library */
#include "Python.h"

#ifdef MS_WINDOWS
/* Windows entry point: wide-character argv, forwarded to Py_Main(). */
int
wmain(int argc, wchar_t **argv)
{
    return Py_Main(argc, argv);
}
#else
/* POSIX entry point: byte-string argv, forwarded to Py_BytesMain(). */
int
main(int argc, char **argv)
{
    return Py_BytesMain(argc, argv);
}
#endif
// 这里就是整个python的主函数所在了,一切的python的入口都在这里。
// 这个函数只是简单区分了一下windows和别的操作系统,我们进到main函数里面看一下。
int
(int argc, char **argv)
Py_BytesMain{
= {
_PyArgv args .argc = argc,
.use_bytes_argv = 1,
.bytes_argv = argv,
.wchar_argv = NULL};
return pymain_main(&args);
}
#ifdef __cplusplus
// 注意,从这里开始,文件就不再在Programs/main.c了,而是在Modules/main.c里面
// 没做什么事儿,接着往下走
static int
(_PyArgv *args)
pymain_main{
= pymain_init(args);
PyStatus status if (_PyStatus_IS_EXIT(status)) {
();
pymain_freereturn status.exitcode;
}
if (_PyStatus_EXCEPTION(status)) {
(status);
pymain_exit_error}
return Py_RunMain();
}
// 这个pymain_init函数是一个非常关键的函数,里面包含了所有python启动之前的初始化
// 里面结构非常复杂,并且牵涉到很多GIL的操作,鉴于我们的pPy并不需要关注这些内容,
// 在这里就进入Py_RunMain,不去关注初始化,直接开始运行
int
(void)
Py_RunMain{
int exitcode = 0;
(&exitcode);
pymain_run_python
if (Py_FinalizeEx() < 0) {
/* Value unlikely to be confused with a non-error exit status or
other special meaning */
= 120;
exitcode }
();
pymain_free
if (_Py_UnhandledKeyboardInterrupt) {
= exit_sigint();
exitcode }
return exitcode;
}
// 状态码,不重要
static void
(int *exitcode)
pymain_run_python{
*main_importer_path = NULL;
PyObject *interp = _PyInterpreterState_GET();
PyInterpreterState /* pymain_run_stdin() modify the config */
*config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
PyConfig
/* ensure path config is written into global variables */
if (_PyStatus_EXCEPTION(_PyPathConfig_UpdateGlobal(config))) {
goto error;
}
if (config->run_filename != NULL) {
/* If filename is a package (ex: directory or ZIP file) which contains
__main__.py, main_importer_path is set to filename and will be
prepended to sys.path.
Otherwise, main_importer_path is left unchanged. */
if (pymain_get_importer(config->run_filename, &main_importer_path,
)) {
exitcodereturn;
}
}
// import readline and rlcompleter before script dir is added to sys.path
(config);
pymain_import_readline
if (main_importer_path != NULL) {
if (pymain_sys_path_add_path0(interp, main_importer_path) < 0) {
goto error;
}
}
else if (!config->safe_path) {
*path0 = NULL;
PyObject int res = _PyPathConfig_ComputeSysPath0(&config->argv, &path0);
if (res < 0) {
goto error;
}
if (res > 0) {
if (pymain_sys_path_add_path0(interp, path0) < 0) {
(path0);
Py_DECREFgoto error;
}
(path0);
Py_DECREF}
}
(config);
pymain_header
if (config->run_command) {
*exitcode = pymain_run_command(config->run_command);
}
else if (config->run_module) {
*exitcode = pymain_run_module(config->run_module, 1);
}
else if (main_importer_path != NULL) {
*exitcode = pymain_run_module(L"__main__", 0);
}
else if (config->run_filename != NULL) {
*exitcode = pymain_run_file(config);
}
else {
*exitcode = pymain_run_stdin(config);
}
(config, exitcode);
pymain_replgoto done;
:
error*exitcode = pymain_exit_err_print();
:
done(main_importer_path);
Py_XDECREF}
// python可以通过各种不同的方式启动,在这个文件里做了规定,进到通过文件启动 pymain_run_file ,也就是我最常用的模式。
static int
(const PyConfig *config)
pymain_run_file{
*filename = PyUnicode_FromWideChar(config->run_filename, -1);
PyObject if (filename == NULL) {
();
PyErr_Printreturn -1;
}
*program_name = PyUnicode_FromWideChar(config->program_name, -1);
PyObject if (program_name == NULL) {
(filename);
Py_DECREF();
PyErr_Printreturn -1;
}
int res = pymain_run_file_obj(program_name, filename,
->skip_source_first_line);
config(filename);
Py_DECREF(program_name);
Py_DECREFreturn res;
}
// 读取文件名和文件信息,不重要
// 进入pymain_run_file_obj
static int
(PyObject *program_name, PyObject *filename,
pymain_run_file_objint skip_source_first_line)
{
if (PySys_Audit("cpython.run_file", "O", filename) < 0) {
return pymain_exit_err_print();
}
FILE *fp = _Py_fopen_obj(filename, "rb");
if (fp == NULL) {
// Ignore the OSError
();
PyErr_Clear("%S: can't open file %R: [Errno %d] %s\n",
PySys_FormatStderr, filename, errno, strerror(errno));
program_namereturn 2;
}
if (skip_source_first_line) {
int ch;
/* Push back first newline so line numbers remain the same */
while ((ch = getc(fp)) != EOF) {
if (ch == '\n') {
(void)ungetc(ch, fp);
break;
}
}
}
struct _Py_stat_struct sb;
if (_Py_fstat_noraise(fileno(fp), &sb) == 0 && S_ISDIR(sb.st_mode)) {
("%S: %R is a directory, cannot continue\n",
PySys_FormatStderr, filename);
program_name(fp);
fclosereturn 1;
}
// Call pending calls like signal handlers (SIGINT)
if (Py_MakePendingCalls() == -1) {
(fp);
fclosereturn pymain_exit_err_print();
}
/* PyRun_AnyFileExFlags(closeit=1) calls fclose(fp) before running code */
= _PyCompilerFlags_INIT;
PyCompilerFlags cf int run = _PyRun_AnyFileObject(fp, filename, 1, &cf);
return (run != 0);
}
// 终于,在嵌套了这么多层之后,文件被打开了,存在 FILE *fp 里面。 无法打开文件的异常也是在这个函数里抛出的。
// 直接进入到_PyRun_AnyFileObject
int
(FILE *fp, PyObject *filename, int closeit,
_PyRun_AnyFileObject*flags)
PyCompilerFlags {
int decref_filename = 0;
if (filename == NULL) {
= PyUnicode_FromString("???");
filename if (filename == NULL) {
();
PyErr_Printreturn -1;
}
= 1;
decref_filename }
int res;
if (_Py_FdIsInteractive(fp, filename)) {
= _PyRun_InteractiveLoopObject(fp, filename, flags);
res if (closeit) {
(fp);
fclose}
}
else {
= _PyRun_SimpleFileObject(fp, filename, closeit, flags);
res }
if (decref_filename) {
(filename);
Py_DECREF}
return res;
}
// 在这里可以看到python的运行又出现了分支,分别是_PyRun_InteractiveLoopObject, _PyRun_SimpleFileObject
// 从debug信息里推测前者应该是指python的命令行交互模式,后者指运行单个python文件。
// 因为后者牵涉到很多文件的配置信息,比较麻烦,作为展示就直接进入到_PyRun_InteractiveLoopObject里面。
int
(FILE *fp, PyObject *filename, PyCompilerFlags *flags)
_PyRun_InteractiveLoopObject{
= _PyCompilerFlags_INIT;
PyCompilerFlags local_flags if (flags == NULL) {
= &local_flags;
flags }
*tstate = _PyThreadState_GET();
PyThreadState *v = _PySys_GetAttr(tstate, &_Py_ID(ps1));
PyObject if (v == NULL) {
(&_Py_ID(ps1), v = PyUnicode_FromString(">>> "));
_PySys_SetAttr(v);
Py_XDECREF}
= _PySys_GetAttr(tstate, &_Py_ID(ps2));
v if (v == NULL) {
(&_Py_ID(ps2), v = PyUnicode_FromString("... "));
_PySys_SetAttr(v);
Py_XDECREF}
#ifdef Py_REF_DEBUG
int show_ref_count = _Py_GetConfig()->show_ref_count;
#endif
int err = 0;
int ret;
int nomem_count = 0;
do {
= PyRun_InteractiveOneObjectEx(fp, filename, flags);
ret if (ret == -1 && PyErr_Occurred()) {
/* Prevent an endless loop after multiple consecutive MemoryErrors
* while still allowing an interactive command to fail with a
* MemoryError. */
if (PyErr_ExceptionMatches(PyExc_MemoryError)) {
if (++nomem_count > 16) {
();
PyErr_Clear= -1;
err break;
}
} else {
= 0;
nomem_count }
();
PyErr_Print();
flush_io} else {
= 0;
nomem_count }
#ifdef Py_REF_DEBUG
if (show_ref_count) {
();
_PyDebug_PrintTotalRefs}
#endif
} while (ret != E_EOF);
return err;
}
// 可以看到,在这个函数里已经开始打印python交互模式的'>>>'了
// 这个函数里我们需要关注的是PyRun_InteractiveOneObjectEx,也就是具体python的每一句话都是怎么被执行的
/* A PyRun_InteractiveOneObject() auxiliary function that does not print the
* error on failure. */
static int
(FILE *fp, PyObject *filename,
PyRun_InteractiveOneObjectEx*flags)
PyCompilerFlags {
*m, *d, *v, *w, *oenc = NULL;
PyObject ;
mod_ty mod*arena;
PyArena const char *ps1 = "", *ps2 = "", *enc = NULL;
int errcode = 0;
*tstate = _PyThreadState_GET();
PyThreadState
if (fp == stdin) {
/* Fetch encoding from sys.stdin if possible. */
= _PySys_GetAttr(tstate, &_Py_ID(stdin));
v if (v && v != Py_None) {
= PyObject_GetAttr(v, &_Py_ID(encoding));
oenc if (oenc)
= PyUnicode_AsUTF8(oenc);
enc if (!enc)
();
PyErr_Clear}
}
= _PySys_GetAttr(tstate, &_Py_ID(ps1));
v if (v != NULL) {
= PyObject_Str(v);
v if (v == NULL)
();
PyErr_Clearelse if (PyUnicode_Check(v)) {
= PyUnicode_AsUTF8(v);
ps1 if (ps1 == NULL) {
();
PyErr_Clear= "";
ps1 }
}
}
= _PySys_GetAttr(tstate, &_Py_ID(ps2));
w if (w != NULL) {
= PyObject_Str(w);
w if (w == NULL)
();
PyErr_Clearelse if (PyUnicode_Check(w)) {
= PyUnicode_AsUTF8(w);
ps2 if (ps2 == NULL) {
();
PyErr_Clear= "";
ps2 }
}
}
= _PyArena_New();
arena if (arena == NULL) {
(v);
Py_XDECREF(w);
Py_XDECREF(oenc);
Py_XDECREFreturn -1;
}
= _PyParser_ASTFromFile(fp, filename, enc, Py_single_input,
mod , ps2, flags, &errcode, arena);
ps1
(v);
Py_XDECREF(w);
Py_XDECREF(oenc);
Py_XDECREFif (mod == NULL) {
(arena);
_PyArena_Freeif (errcode == E_EOF) {
();
PyErr_Clearreturn E_EOF;
}
return -1;
}
= PyImport_AddModuleObject(&_Py_ID(__main__));
m if (m == NULL) {
(arena);
_PyArena_Freereturn -1;
}
= PyModule_GetDict(m);
d = run_mod(mod, filename, d, d, flags, arena);
v (arena);
_PyArena_Freeif (v == NULL) {
return -1;
}
(v);
Py_DECREF();
flush_ioreturn 0;
}
// 前面的编码部分全部跳过不看,找到_PyParser_ASTFromFile这个函数
// 从名字就可以看出,这个函数已经进入到编译器的部分了,用于从文件获得AST
mod_ty(FILE *fp, PyObject *filename_ob, const char *enc,
_PyParser_ASTFromFileint mode, const char *ps1, const char* ps2,
*flags, int *errcode, PyArena *arena)
PyCompilerFlags {
if (PySys_Audit("compile", "OO", Py_None, filename_ob) < 0) {
return NULL;
}
return _PyPegen_run_parser_from_file_pointer(fp, mode, filename_ob, enc, ps1, ps2,
, errcode, arena);
flags}
// 进来以后发现是编译器的API文件,说明我们找的没错,接着往里走
mod_ty(FILE *fp, int start_rule, PyObject *filename_ob,
_PyPegen_run_parser_from_file_pointerconst char *enc, const char *ps1, const char *ps2,
*flags, int *errcode, PyArena *arena)
PyCompilerFlags {
struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
if (tok == NULL) {
if (PyErr_Occurred()) {
(filename_ob);
_PyPegen_raise_tokenizer_init_errorreturn NULL;
}
return NULL;
}
if (!tok->fp || ps1 != NULL || ps2 != NULL ||
(filename_ob, "<stdin>") == 0) {
PyUnicode_CompareWithASCIIString->fp_interactive = 1;
tok}
// This transfers the ownership to the tokenizer
->filename = filename_ob;
tok(filename_ob);
Py_INCREF
// From here on we need to clean up even if there's an error
= NULL;
mod_ty result
int parser_flags = compute_parser_flags(flags);
*p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
Parser , arena);
errcodeif (p == NULL) {
goto error;
}
= _PyPegen_run_parser(p);
result (p);
_PyPegen_Parser_Free
:
error(tok);
_PyTokenizer_Freereturn result;
}
// 到这一步,这个函数就已经存在于我们在文章一开始就提到的pegen.c之中了,
// 已经即将进入到我们所熟悉的领域了(放屁,一点都不熟)
// 这里略加说明,这个函数和上一个函数返回的 mod_ty 的定义如下
// struct _mod {
// enum _mod_kind kind;
// union {
// struct {
// asdl_stmt_seq *body;
// asdl_type_ignore_seq *type_ignores;
// } Module;
// struct {
// asdl_stmt_seq *body;
// } Interactive;
// struct {
// expr_ty body;
// } Expression;
// struct {
// asdl_expr_seq *argtypes;
// expr_ty returns;
// } FunctionType;
// } v;
// };
// 这个结构体是构建AST用的,里面的asdl_stmt_seq等结构全部都来自一个_stmt的结构体,它的定义如下
// 注意,这里你们会看到一个Thanks_kind,这个是原版python里面不存在的,是我为了测试pPy的时候增加的关键字,其作用和pass一样
// 可以看到,在stmt里面定义了python所有内置的关键字
// enum _stmt_kind {FunctionDef_kind=1, AsyncFunctionDef_kind=2, ClassDef_kind=3,
// Return_kind=4, Delete_kind=5, Assign_kind=6,
// AugAssign_kind=7, AnnAssign_kind=8, For_kind=9,
// AsyncFor_kind=10, While_kind=11, If_kind=12, With_kind=13,
// AsyncWith_kind=14, Match_kind=15, Raise_kind=16, Try_kind=17,
// TryStar_kind=18, Assert_kind=19, Import_kind=20,
// ImportFrom_kind=21, Global_kind=22, Nonlocal_kind=23,
// Expr_kind=24, Pass_kind=25, Thanks_kind=26, Break_kind=27,
// Continue_kind=28};
// struct _stmt {
// enum _stmt_kind kind;
// union {
// struct {
// identifier name;
// arguments_ty args;
// asdl_stmt_seq *body;
// asdl_expr_seq *decorator_list;
// expr_ty returns;
// string type_comment;
// } FunctionDef;
// struct {
// identifier name;
// arguments_ty args;
// asdl_stmt_seq *body;
// asdl_expr_seq *decorator_list;
// expr_ty returns;
// string type_comment;
// } AsyncFunctionDef;
// struct {
// identifier name;
// asdl_expr_seq *bases;
// asdl_keyword_seq *keywords;
// asdl_stmt_seq *body;
// asdl_expr_seq *decorator_list;
// } ClassDef;
// struct {
// expr_ty value;
// } Return;
// struct {
// asdl_expr_seq *targets;
// } Delete;
// struct {
// asdl_expr_seq *targets;
// expr_ty value;
// string type_comment;
// } Assign;
// struct {
// expr_ty target;
// operator_ty op;
// expr_ty value;
// } AugAssign;
// struct {
// expr_ty target;
// expr_ty annotation;
// expr_ty value;
// int simple;
// } AnnAssign;
// struct {
// expr_ty target;
// expr_ty iter;
// asdl_stmt_seq *body;
// asdl_stmt_seq *orelse;
// string type_comment;
// } For;
// struct {
// expr_ty target;
// expr_ty iter;
// asdl_stmt_seq *body;
// asdl_stmt_seq *orelse;
// string type_comment;
// } AsyncFor;
// struct {
// expr_ty test;
// asdl_stmt_seq *body;
// asdl_stmt_seq *orelse;
// } While;
// struct {
// expr_ty test;
// asdl_stmt_seq *body;
// asdl_stmt_seq *orelse;
// } If;
// struct {
// asdl_withitem_seq *items;
// asdl_stmt_seq *body;
// string type_comment;
// } With;
// struct {
// asdl_withitem_seq *items;
// asdl_stmt_seq *body;
// string type_comment;
// } AsyncWith;
// struct {
// expr_ty subject;
// asdl_match_case_seq *cases;
// } Match;
// struct {
// expr_ty exc;
// expr_ty cause;
// } Raise;
// struct {
// asdl_stmt_seq *body;
// asdl_excepthandler_seq *handlers;
// asdl_stmt_seq *orelse;
// asdl_stmt_seq *finalbody;
// } Try;
// struct {
// asdl_stmt_seq *body;
// asdl_excepthandler_seq *handlers;
// asdl_stmt_seq *orelse;
// asdl_stmt_seq *finalbody;
// } TryStar;
// struct {
// expr_ty test;
// expr_ty msg;
// } Assert;
// struct {
// asdl_alias_seq *names;
// } Import;
// struct {
// identifier module;
// asdl_alias_seq *names;
// int level;
// } ImportFrom;
// struct {
// asdl_identifier_seq *names;
// } Global;
// struct {
// asdl_identifier_seq *names;
// } Nonlocal;
// struct {
// expr_ty value;
// } Expr;
// } v;
// int lineno;
// int col_offset;
// int end_lineno;
// int end_col_offset;
// };
// 回到刚才的函数,注意它的返回值 result = _PyPegen_run_parser(p);
// 这说明就是这个 _PyPegen_run_parser 函数生成了已经被编译成 mod_ty 的python源码。
// 传进去的参数p是一个parser结构体,API也在pegen.c这个文件里,涉及到内存管理,
// 并且这个parser也拿了很多tokenize的函数进去,太复杂我直接跳过了。
void *
(Parser *p)
_PyPegen_run_parser{
void *res = _PyPegen_parse(p);
(p->level == 0);
assertif (res == NULL) {
if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) && _is_end_of_source(p)) {
();
PyErr_Clearreturn RAISE_SYNTAX_ERROR("incomplete input");
}
if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
return NULL;
}
// Make a second parser pass. In this pass we activate heavier and slower checks
// to produce better error messages and more complete diagnostics. Extra "invalid_*"
// rules will be active during parsing.
*last_token = p->tokens[p->fill - 1];
Token (p);
reset_parser_state_for_error_pass(p);
_PyPegen_parse
// Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
// point.
(p, last_token);
_Pypegen_set_syntax_errorreturn NULL;
}
if (p->start_rule == Py_single_input && bad_single_statement(p)) {
->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
preturn RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
}
// test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
if (p->start_rule == Py_single_input ||
->start_rule == Py_file_input ||
p->start_rule == Py_eval_input)
p{
if (!_PyAST_Validate(res)) {
return NULL;
}
}
#endif
return res;
}
// 进来以后似乎是一层wrapper,那就再接着往下
// 函数依然有大段内容在捕获异常,真正起编译作用的是_PyPegen_parse这个函数
void *
(Parser *p)
_PyPegen_parse{
// Initialize keywords
->keywords = reserved_keywords;
p->n_keyword_lists = n_keyword_lists;
p->soft_keywords = soft_keywords;
p
// Run parser
void *result = NULL;
if (p->start_rule == Py_file_input) {
= file_rule(p);
result } else if (p->start_rule == Py_single_input) {
= interactive_rule(p);
result } else if (p->start_rule == Py_eval_input) {
= eval_rule(p);
result } else if (p->start_rule == Py_func_type_input) {
= func_type_rule(p);
result } else if (p->start_rule == Py_fstring_input) {
= fstring_rule(p);
result }
return result;
}
// _PyPegen_parse这个函数就了不得了,它处在parser.c这个文件的第三万九千行的位置,非常恐怖。
// 当然,正如在文章开头提到的,这个巨大的parser.c文件也并不是python开发者们一个一个字写出来的,
// 而是被一个python文件根据语法文件生成出来的。文件在 Tools/peg_generator/pegen 里,有一大坨。
// 在C里面看了这么久,终于碰到python感动万分,立马进去看一下。
// 看了以后觉得有点逆天,截取一个方法展示一下
// def _set_up_rule_memoization(self, node: Rule, result_type: str) -> None:
// self.print("{")
// with self.indent():
// self.add_level()
// self.print(f"{result_type} _res = NULL;")
// self.print(f"if (_PyPegen_is_memoized(p, {node.name}_type, &_res)) {{")
// with self.indent():
// self.add_return("_res")
// self.print("}")
// self.print("int _mark = p->mark;")
// self.print("int _resmark = p->mark;")
// self.print("while (1) {")
// with self.indent():
// self.call_with_errorcheck_return(
// f"_PyPegen_update_memo(p, _mark, {node.name}_type, _res)", "_res"
// )
// self.print("p->mark = _mark;")
// self.print(f"void *_raw = {node.name}_raw(p);")
// self.print("if (p->error_indicator) {")
// with self.indent():
// self.add_return("NULL")
// self.print("}")
// self.print("if (_raw == NULL || p->mark <= _resmark)")
// with self.indent():
// self.print("break;")
// self.print(f"_resmark = p->mark;")
// self.print("_res = _raw;")
// self.print("}")
// self.print(f"p->mark = _resmark;")
// self.add_return("_res")
// self.print("}")
// self.print(f"static {result_type}")
// self.print(f"{node.name}_raw(Parser *p)")
// 觉得python的开发者们头还是很铁的,就字符串硬拼啊,debug时候岂不是。。。
// 这个文件是python2更新到3的时候创建的,可见当时。。。。。
// (甚至生成方式和我这个博客的搭建如出一辙)
// 回到parser.c,这个方法里面根据输入的命令的不同,给出了不同的处理
// 这几个处理的子方法的结构都是差不多的,作为展示,这里依然还是不忘初心,
// 选择一个interactive_rule吧。
// interactive: statement_newline
static mod_ty
(Parser *p)
interactive_rule{
if (p->level++ == MAXSTACK) {
->error_indicator = 1;
p();
PyErr_NoMemory}
if (p->error_indicator) {
->level--;
preturn NULL;
}
= NULL;
mod_ty _res int _mark = p->mark;
{ // statement_newline
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> interactive[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "statement_newline"));
D* a;
asdl_stmt_seqif (
(a = statement_newline_rule(p)) // statement_newline
)
{
(fprintf(stderr, "%*c+ interactive[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "statement_newline"));
D= _PyAST_Interactive ( a , p -> arena );
_res if (_res == NULL && PyErr_Occurred()) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s interactive[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "statement_newline"));
p}
= NULL;
_res :
done->level--;
preturn _res;
}
// 其实,到这一步就发现,它依然还是一个wrapper,核心函数是 _PyAST_Interactive
// 它长这样:
// mod_ty
// _PyAST_Interactive(asdl_stmt_seq * body, PyArena *arena)
// {
// mod_ty p;
// p = (mod_ty)_PyArena_Malloc(arena, sizeof(*p));
// if (!p)
// return NULL;
// p->kind = Interactive_kind;
// p->v.Interactive.body = body;
// return p;
// }
// 分配内存,然后实例化一个mod_ty,所以这就需要我们回到mod_ty的实现上去
// 正如之前说的,mod_ty的实现依赖于_stmt, 而无论是mod_ty 还是 _stmt 都在 pycore-ast.h 里面,
// 而这个头文件是事先预写好的,并且没有涉及到编译的部分
// 在这里,源码的追踪变得困难了一些,
// 其实我们此刻更应该关注的是这个给到_PyAST_Interactive的第一个参数 body。
// 这个body才是包含了所有结构信息的参数。
// 所以,回到上面的函数里,body来自参数a,且有这么一行代码,
// a = statement_newline_rule(p),
// 所以应该去看一下statement_newline_rule这个函数。
// statement_newline: compound_stmt NEWLINE | simple_stmts | NEWLINE | $
static asdl_stmt_seq*
(Parser *p)
statement_newline_rule{
if (p->level++ == MAXSTACK) {
->error_indicator = 1;
p();
PyErr_NoMemory}
if (p->error_indicator) {
->level--;
preturn NULL;
}
* _res = NULL;
asdl_stmt_seqint _mark = p->mark;
if (p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
int _start_lineno = p->tokens[_mark]->lineno;
(_start_lineno); // Only used by EXTRA macro
UNUSEDint _start_col_offset = p->tokens[_mark]->col_offset;
(_start_col_offset); // Only used by EXTRA macro
UNUSED{ // compound_stmt NEWLINE
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> statement_newline[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "compound_stmt NEWLINE"));
D;
stmt_ty a* newline_var;
Token if (
(a = compound_stmt_rule(p)) // compound_stmt
&&
(newline_var = _PyPegen_expect_token(p, NEWLINE)) // token='NEWLINE'
)
{
(fprintf(stderr, "%*c+ statement_newline[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "compound_stmt NEWLINE"));
D= ( asdl_stmt_seq* ) _PyPegen_singleton_seq ( p , a );
_res if (_res == NULL && PyErr_Occurred()) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s statement_newline[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "compound_stmt NEWLINE"));
p}
{ // simple_stmts
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> statement_newline[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "simple_stmts"));
D* simple_stmts_var;
asdl_stmt_seqif (
(simple_stmts_var = simple_stmts_rule(p)) // simple_stmts
)
{
(fprintf(stderr, "%*c+ statement_newline[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "simple_stmts"));
D= simple_stmts_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s statement_newline[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "simple_stmts"));
p}
{ // NEWLINE
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> statement_newline[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "NEWLINE"));
D* newline_var;
Token if (
(newline_var = _PyPegen_expect_token(p, NEWLINE)) // token='NEWLINE'
)
{
(fprintf(stderr, "%*c+ statement_newline[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "NEWLINE"));
D*_token = _PyPegen_get_last_nonnwhitespace_token(p);
Token if (_token == NULL) {
->level--;
preturn NULL;
}
int _end_lineno = _token->end_lineno;
(_end_lineno); // Only used by EXTRA macro
UNUSEDint _end_col_offset = _token->end_col_offset;
(_end_col_offset); // Only used by EXTRA macro
UNUSED= ( asdl_stmt_seq* ) _PyPegen_singleton_seq ( p , CHECK ( stmt_ty , _PyAST_Pass ( EXTRA ) ) );
_res if (_res == NULL && PyErr_Occurred()) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s statement_newline[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "NEWLINE"));
p}
{ // $
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> statement_newline[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "$"));
D* endmarker_var;
Token if (
(endmarker_var = _PyPegen_expect_token(p, ENDMARKER)) // token='ENDMARKER'
)
{
(fprintf(stderr, "%*c+ statement_newline[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "$"));
D= _PyPegen_interactive_exit ( p );
_res if (_res == NULL && PyErr_Occurred()) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s statement_newline[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "$"));
p}
= NULL;
_res :
done->level--;
preturn _res;
}
// 并不短,可以看到函数里应对各种不同的需要换行的情况给出了对应的 asdl_stmt_seq,
// 当然,作为一个不懂编译原理的人,在这里我就需要小心翼翼地绕开这些 asdl打头的函数
// 在这里,我们可以关注到两个函数,分别是simple_stmts_rule,和 compound_stmt_rule,
// 显然,这两个函数就是语法规则了。
// 选一个simple吧
// simple_stmts: simple_stmt !';' NEWLINE | ';'.simple_stmt+ ';'? NEWLINE
// simple_stmts_rule函数非常长,并且里面其实是在重复调用simple_stmt_rule,
// 那么我们直接看这个simple_stmt_rule吧。
// 单个代码块太大会让我的编辑器很卡,所以我在这里换行一下
// simple_stmt:
// | assignment
// | star_expressions
// | &'return' return_stmt
// | &('import' | 'from') import_stmt
// | &'raise' raise_stmt
// | 'pass'
// | 'thanks'
// | &'del' del_stmt
// | &'yield' yield_stmt
// | &'assert' assert_stmt
// | 'break'
// | 'continue'
// | &'global' global_stmt
// | &'nonlocal' nonlocal_stmt
static stmt_ty
(Parser *p)
simple_stmt_rule{
if (p->level++ == MAXSTACK) {
->error_indicator = 1;
p();
PyErr_NoMemory}
if (p->error_indicator) {
->level--;
preturn NULL;
}
= NULL;
stmt_ty _res if (_PyPegen_is_memoized(p, simple_stmt_type, &_res)) {
->level--;
preturn _res;
}
int _mark = p->mark;
if (p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
int _start_lineno = p->tokens[_mark]->lineno;
(_start_lineno); // Only used by EXTRA macro
UNUSEDint _start_col_offset = p->tokens[_mark]->col_offset;
(_start_col_offset); // Only used by EXTRA macro
UNUSED{ // assignment
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "assignment"));
D;
stmt_ty assignment_varif (
(assignment_var = assignment_rule(p)) // assignment
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "assignment"));
D= assignment_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "assignment"));
p}
{ // star_expressions
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "star_expressions"));
D;
expr_ty eif (
(e = star_expressions_rule(p)) // star_expressions
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "star_expressions"));
D*_token = _PyPegen_get_last_nonnwhitespace_token(p);
Token if (_token == NULL) {
->level--;
preturn NULL;
}
int _end_lineno = _token->end_lineno;
(_end_lineno); // Only used by EXTRA macro
UNUSEDint _end_col_offset = _token->end_col_offset;
(_end_col_offset); // Only used by EXTRA macro
UNUSED= _PyAST_Expr ( e , EXTRA );
_res if (_res == NULL && PyErr_Occurred()) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "star_expressions"));
p}
{ // &'return' return_stmt
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "&'return' return_stmt"));
D;
stmt_ty return_stmt_varif (
(1, _PyPegen_expect_token, p, 520) // token='return'
_PyPegen_lookahead_with_int&&
(return_stmt_var = return_stmt_rule(p)) // return_stmt
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "&'return' return_stmt"));
D= return_stmt_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "&'return' return_stmt"));
p}
{ // &('import' | 'from') import_stmt
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "&('import' | 'from') import_stmt"));
D;
stmt_ty import_stmt_varif (
(1, _tmp_6_rule, p)
_PyPegen_lookahead&&
(import_stmt_var = import_stmt_rule(p)) // import_stmt
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "&('import' | 'from') import_stmt"));
D= import_stmt_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "&('import' | 'from') import_stmt"));
p}
{ // &'raise' raise_stmt
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "&'raise' raise_stmt"));
D;
stmt_ty raise_stmt_varif (
(1, _PyPegen_expect_token, p, 523) // token='raise'
_PyPegen_lookahead_with_int&&
(raise_stmt_var = raise_stmt_rule(p)) // raise_stmt
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "&'raise' raise_stmt"));
D= raise_stmt_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "&'raise' raise_stmt"));
p}
{ // 'pass'
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "'pass'"));
D* _keyword;
Token if (
(_keyword = _PyPegen_expect_token(p, 504)) // token='pass'
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "'pass'"));
D*_token = _PyPegen_get_last_nonnwhitespace_token(p);
Token if (_token == NULL) {
->level--;
preturn NULL;
}
int _end_lineno = _token->end_lineno;
(_end_lineno); // Only used by EXTRA macro
UNUSEDint _end_col_offset = _token->end_col_offset;
(_end_col_offset); // Only used by EXTRA macro
UNUSED= _PyAST_Pass ( EXTRA );
_res if (_res == NULL && PyErr_Occurred()) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "'pass'"));
p}
{ // 'thanks'
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "'thanks'"));
D* _keyword;
Token if (
(_keyword = _PyPegen_expect_token(p, 505)) // token='thanks'
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "'thanks'"));
D*_token = _PyPegen_get_last_nonnwhitespace_token(p);
Token if (_token == NULL) {
->level--;
preturn NULL;
}
int _end_lineno = _token->end_lineno;
(_end_lineno); // Only used by EXTRA macro
UNUSEDint _end_col_offset = _token->end_col_offset;
(_end_col_offset); // Only used by EXTRA macro
UNUSED= _PyAST_Thanks ( EXTRA );
_res if (_res == NULL && PyErr_Occurred()) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "'thanks'"));
p}
{ // &'del' del_stmt
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "&'del' del_stmt"));
D;
stmt_ty del_stmt_varif (
(1, _PyPegen_expect_token, p, 604) // token='del'
_PyPegen_lookahead_with_int&&
(del_stmt_var = del_stmt_rule(p)) // del_stmt
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "&'del' del_stmt"));
D= del_stmt_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "&'del' del_stmt"));
p}
{ // &'yield' yield_stmt
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "&'yield' yield_stmt"));
D;
stmt_ty yield_stmt_varif (
(1, _PyPegen_expect_token, p, 574) // token='yield'
_PyPegen_lookahead_with_int&&
(yield_stmt_var = yield_stmt_rule(p)) // yield_stmt
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "&'yield' yield_stmt"));
D= yield_stmt_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "&'yield' yield_stmt"));
p}
{ // &'assert' assert_stmt
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "&'assert' assert_stmt"));
D;
stmt_ty assert_stmt_varif (
(1, _PyPegen_expect_token, p, 527) // token='assert'
_PyPegen_lookahead_with_int&&
(assert_stmt_var = assert_stmt_rule(p)) // assert_stmt
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "&'assert' assert_stmt"));
D= assert_stmt_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "&'assert' assert_stmt"));
p}
{ // 'break'
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "'break'"));
D* _keyword;
Token if (
(_keyword = _PyPegen_expect_token(p, 509)) // token='break'
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "'break'"));
D*_token = _PyPegen_get_last_nonnwhitespace_token(p);
Token if (_token == NULL) {
->level--;
preturn NULL;
}
int _end_lineno = _token->end_lineno;
(_end_lineno); // Only used by EXTRA macro
UNUSEDint _end_col_offset = _token->end_col_offset;
(_end_col_offset); // Only used by EXTRA macro
UNUSED= _PyAST_Break ( EXTRA );
_res if (_res == NULL && PyErr_Occurred()) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "'break'"));
p}
{ // 'continue'
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "'continue'"));
D* _keyword;
Token if (
(_keyword = _PyPegen_expect_token(p, 510)) // token='continue'
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "'continue'"));
D*_token = _PyPegen_get_last_nonnwhitespace_token(p);
Token if (_token == NULL) {
->level--;
preturn NULL;
}
int _end_lineno = _token->end_lineno;
(_end_lineno); // Only used by EXTRA macro
UNUSEDint _end_col_offset = _token->end_col_offset;
(_end_col_offset); // Only used by EXTRA macro
UNUSED= _PyAST_Continue ( EXTRA );
_res if (_res == NULL && PyErr_Occurred()) {
->error_indicator = 1;
p->level--;
preturn NULL;
}
goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "'continue'"));
p}
{ // &'global' global_stmt
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "&'global' global_stmt"));
D;
stmt_ty global_stmt_varif (
(1, _PyPegen_expect_token, p, 524) // token='global'
_PyPegen_lookahead_with_int&&
(global_stmt_var = global_stmt_rule(p)) // global_stmt
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "&'global' global_stmt"));
D= global_stmt_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "&'global' global_stmt"));
p}
{ // &'nonlocal' nonlocal_stmt
if (p->error_indicator) {
->level--;
preturn NULL;
}
(fprintf(stderr, "%*c> simple_stmt[%d-%d]: %s\n", p->level, ' ', _mark, p->mark, "&'nonlocal' nonlocal_stmt"));
D;
stmt_ty nonlocal_stmt_varif (
(1, _PyPegen_expect_token, p, 525) // token='nonlocal'
_PyPegen_lookahead_with_int&&
(nonlocal_stmt_var = nonlocal_stmt_rule(p)) // nonlocal_stmt
)
{
(fprintf(stderr, "%*c+ simple_stmt[%d-%d]: %s succeeded!\n", p->level, ' ', _mark, p->mark, "&'nonlocal' nonlocal_stmt"));
D= nonlocal_stmt_var;
_res goto done;
}
->mark = _mark;
p(fprintf(stderr, "%*c%s simple_stmt[%d-%d]: %s failed!\n", p->level, ' ',
D->error_indicator ? "ERROR!" : "-", _mark, p->mark, "&'nonlocal' nonlocal_stmt"));
p}
= NULL;
_res :
done(p, _mark, simple_stmt_type, _res);
_PyPegen_insert_memo->level--;
preturn _res;
}
// 可以看到,这里对所有的simple statement都做了实现,那些compound statement就是类似def之类。
// 选一个我喜欢的关键字pass吧,可以看到这样一段代码
// (_keyword = _PyPegen_expect_token (p, 504)) // token='pass'
// _PyPegen_expect_token 这个方法在之前的newline token那里也出现过,
// 所以,可见所有的token都是由这个方法生成的。
// 进去以后又是非常非常多层的调用,避免篇幅过长就不一一展示了,
// 最终可以确定到tokenizer.c里面的tok_get
// 也就是最早最早我在例子里说到的这个长达几百行的函数
// 给出一小段
static int
(struct tok_state *tok, const char **p_start, const char **p_end)
tok_get{
// printf("[start] tokenizer.c tok_get is called\n");
int c;
int blankline, nonascii;
*p_start = *p_end = NULL;
:
nextline->start = NULL;
tok= 0;
blankline
/* Get indentation level */
if (tok->atbol) {
int col = 0;
int altcol = 0;
->atbol = 0;
tokint cont_line_col = 0;
for (;;) {
= tok_nextc(tok);
c if (c == ' ') {
++, altcol++;
col}
else if (c == '\t') {
= (col / tok->tabsize + 1) * tok->tabsize;
col = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
altcol }
else if (c == '\014') {/* Control-L (formfeed) */
= altcol = 0; /* For Emacs users */
col }
else if (c == '\\') {
// Indentation cannot be split over multiple physical lines
// using backslashes. This means that if we found a backslash
// preceded by whitespace, **the first one we find** determines
// the level of indentation of whatever comes next.
= cont_line_col ? cont_line_col : col;
cont_line_col if ((c = tok_continuation_line(tok)) == -1) {
return ERRORTOKEN;
}
}
else {
break;
}
}
(tok, c);
tok_backupif (c == '#' || c == '\n') {
}
。。。。。。}
}
// 可以看到,这个方法所处理的第一个内容就是python的缩进,
// 对的,在python中,缩进是一个单独的token
// 可以看到下面紧接着处理的是注释和换行符
// 至于这个方法是如何获得我们写的文本,我们进入tok_nextc看一下
/* Get next char, updating state; error code goes into tok->done */
static int
(struct tok_state *tok)
tok_nextc{
int rc;
for (;;) {
if (tok->cur != tok->inp) {
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
if (tok->done != E_OK) {
return EOF;
}
if (tok->fp == NULL) {
= tok_underflow_string(tok);
rc }
else if (tok->prompt != NULL) {
= tok_underflow_interactive(tok);
rc }
else {
= tok_underflow_file(tok);
rc }
#if defined(Py_DEBUG)
if (tok->debug) {
(stderr, "line[%d] = ", tok->lineno);
fprintf(stderr, tok->cur, tok->inp - tok->cur);
print_escape(stderr, " tok->done = %d\n", tok->done);
fprintf}
#endif
if (!rc) {
->cur = tok->inp;
tokreturn EOF;
}
->line_start = tok->cur;
tok}
();
Py_UNREACHABLE}
// tok_state 主要是存储文件阅读的情况,比如现在tokenizer正读到第几个字就是tok->cur
// 这里就很清楚的看到,用Py_CHARMASK这个宏转换了一下tok->cur的下一个字符
// /* Argument must be a char or an int in [-128, 127] or [0, 255]. */
// #define Py_CHARMASK(c) ((unsigned char)((c) & 0xff))
// 宏的定义在这里
// 这么看来,我写的所有python文本都在这个存在tok_state里面,
// 的确,如果去看tok_state的实现,可以看到其第一个成员就是buf,
// 存储的就是字符串形式的文本信息。
// 而这个文本信息自从文件被打开以后就一直存放在 FILE *fp 里面,作为参数传递
// 直到 _PyPegen_run_parser_from_file_pointer 函数,有这么一行
// struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
// fp的信息被写进了tok_state,而后,
// Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
// errcode, arena);
// tok 这个变量又被作为参数用于生成 parser 了。
// 再往后这个parser就一直作为变量传递到底层,
// 一直到tokenizer.c这个文件之后,p->tok 这个参数又重新被拿出来使用。
// 也正因为如此,在最底层的 tok_nextc 才可以持续读出字符
// 终于,我们已经完成了全部的步骤,知道当我们在python里敲下回车的时候都发生了什么
// 虽然很不完全,呵呵
至此,在小心翼翼地绕开了AST部分之后,代码的tracing就已经结束了。
回头看来,每个函数的结构都不复杂,但是调用层数多达几十上百层以后确实让人眼花缭乱。
一个较大项目的源码并不是说啃就能啃下来的,特别是在翻遍了国内外的互联网也没有找到详细的资料的时候。所以,借此机会,我决定记录一下我的探索,虽然不一定能帮到后来者。
最终,弄明白一切之后,我也就仅仅给尚未新生的pPy添加了一个关键字,thanks,语法和pass无异。然后,我写了我的第一个,也许也是全世界唯一一份pPy程序,只有一个词,thanks。它会一路从main调用到tok_nextc,然后再返回,但是什么都达成不了,就和没做一样,就和我做的这些一样。不过,至少编译通过的时候我还是挺开心的,我知道后续若想让pPy可以实现a++的语法也不难了。
在长达或是短至十个小时的挣扎后,我们可怜的pPy算是寿终正寝或是胎死腹中了。我自问,潜下心来,一点点在CPython里把这个小项目做出来并不是一件不可能的事,但是,我哪里来的这些时间呢?这个没有生产力的小项目做出来又有什么用呢?我花费的这些时间和心血最终能有回报吗?
临时兴起很容易,做下去很难。
pPy死了,和我曾打过的许多小算盘埋在一起,没有坟墓。