他主要是先建構出類似於自動機的圖,有很多個狀態在跳來跳去,在我的實作裡有將其轉成語法樹的方法。
這裡需要自己實作的有lexer,還有最後的語法一定要有一個gamma rule,gamma rule就是有一個gamma \Longrightarrow S,S是初始符號。
首先是語法規則的部分:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
struct Rule{ | |
vector<vector<Rule*> > p; | |
void add(const vector<Rule*> &l){ | |
p.push_back(l); | |
} | |
}; | |
map<string,Rule*> NameRule; | |
map<Rule*,string> RuleName; | |
inline void init_Rule(){ | |
for(auto r:RuleName)delete r.first; | |
RuleName.clear(); | |
NameRule.clear(); | |
} | |
inline Rule *add_rule(const string &s){ | |
if(NameRule.find(s)!=NameRule.end())return NameRule[s]; | |
Rule *r=new Rule(); | |
RuleName[r]=s; | |
NameRule[s]=r; | |
return r; | |
} | |
//範例 | |
inline Rule *get_my_Rule(){ | |
//可以由題目輸入語法,也可以自己設定 | |
Rule *S=add_rule("S"),*E=add_rule("E"),*L=add_rule("L"); | |
Rule *list=add_rule("("),*AND=add_rule(")"),*T=add_rule("T"); | |
S->add({list,E}); | |
S->add({list,L}); | |
L->add({E,L}); | |
L->add({E,AND,E}); | |
E->add({T}); | |
E->add({S}); | |
Rule *GAMMA=add_rule("GAMMA");//一定要有gamma rule當作是最上層的語法 | |
GAMMA->add({S}); | |
return GAMMA; | |
} |
這是分析器的本體,lexer的部分要自己去完成,lexer的結果會是parse的第二個參數,計算出來一些資料會存在table裡面
以下為模板:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
typedef vector<Rule*> production; | |
struct State{ | |
Rule *r; | |
int rid,dot_id,start,end; | |
State(Rule *r,int rid,int dot,int start):r(r),rid(rid),dot_id(dot),start(start),end(-1){} | |
State(Rule *r=0,int col=0):r(r),rid(-1),dot_id(-1),start(-1),end(col){} | |
bool completed()const{ | |
return rid==-1||dot_id>=(int)r->p[rid].size(); | |
} | |
Rule *next_term()const{ | |
if(completed())return 0; | |
return r->p[rid][dot_id]; | |
} | |
bool operator<(const State& b)const{ | |
if(start!=b.start)return start<b.start; | |
if(dot_id!=b.dot_id)return dot_id<b.dot_id; | |
if(r!=b.r)return r<b.r; | |
return rid<b.rid; | |
} | |
void print()const{ | |
cout<<RuleName[r]<<"->"; | |
if(rid!=-1)for(size_t i=0;;++i){ | |
if((int)i==dot_id)cout<<" "<<"$"; | |
if(i>=r->p[rid].size())break; | |
cout<<" "<<RuleName[r->p[rid][i]]; | |
} | |
cout<<" "<<"["<<start<<","<<end<<"]"<<endl; | |
} | |
}; | |
struct Column{ | |
Rule *term; | |
string value; | |
vector<State> s; | |
map<State,set<pair<State,State> > > div; | |
//div比較像一棵 左兄右子的樹 | |
Column(Rule *r,const string &s):term(r),value(s){} | |
Column(){} | |
bool add(const State &st,int col){ | |
if(div.find(st)==div.end()){ | |
div[st]; | |
s.push_back(st); | |
s.back().end=col; | |
return true; | |
}else return false; | |
} | |
}; | |
inline vector<Column> lexer(string text){ | |
//tokenize,要自己寫 | |
//他會把 input stream 變成 token stream,就是(terminal,value)pair | |
vector<Column> token; | |
//自己寫的地方 | |
return token; | |
} | |
vector<Column> table; | |
inline void predict(int col,Rule *rul){ | |
for(size_t i=0;i<rul->p.size();++i){ | |
table[col].add(State(rul,i,0,col),col); | |
} | |
} | |
inline void scan(int col,const State &s,Rule *r){ | |
if(r!=table[col].term)return; | |
State ns(s.r,s.rid,s.dot_id+1,s.start); | |
table[col].add(ns,col); | |
table[col].div[ns].insert(make_pair(s,State(r,col))); | |
} | |
inline void complete(int col,const State &s){ | |
for(size_t i=0;i<table[s.start].s.size();++i){ | |
State &st=table[s.start].s[i]; | |
Rule *term=st.next_term(); | |
if(!term||term->p.size()==0)continue; | |
if(term==s.r){ | |
State nst(st.r,st.rid,st.dot_id+1,st.start); | |
table[col].add(nst,col); | |
table[col].div[nst].insert(make_pair(st,s)); | |
} | |
} | |
} | |
inline pair<bool,State> parse(Rule *GAMMA,const vector<Column > &token){ | |
table.resize(token.size()+1); | |
for(size_t i=0;i<token.size();++i){ | |
table[i+1]=Column(token[i]); | |
} | |
table[0]=Column(); | |
table[0].add(State(GAMMA,0,0,0),0); | |
for(size_t i=0;i<table.size();++i){ | |
for(size_t j=0;j<table[i].s.size();++j){ | |
State state=table[i].s[j]; | |
if(state.completed())complete(i,state); | |
else{ | |
Rule *term=state.next_term(); | |
if(term->p.size())predict(i,term); | |
else if(i+1<table.size())scan(i+1,state,term); | |
} | |
} | |
} | |
for(size_t i=0;i<table.back().s.size();++i){ | |
if(table.back().s[i].r==GAMMA&&table.back().s[i].completed()){ | |
return make_pair(true,table.back().s[i]); | |
} | |
} | |
return make_pair(false,State(0,-1)); | |
} |
最後是構造語法樹的部分,因為div是一顆左兄弟右兒子的樹,所以要將她轉換成正常的語法樹,還要去記錄曖昧文法的部分:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
struct node{//語法樹的節點 | |
State s; | |
vector<vector<node*> > child;//vector<node*>.size()>1表示ambiguous | |
node(const State &s):s(s){} | |
node(){} | |
}; | |
struct State_end_cmp{ | |
bool operator()(const State &a,const State &b)const{ | |
return a.end<b.end||(a.end==b.end&&a<b); | |
} | |
}; | |
map<State,node*,State_end_cmp> cache; | |
vector<node*> node_set; | |
inline void init_cache(){ | |
for(auto d:node_set)delete d; | |
cache.clear(); | |
node_set.clear(); | |
} | |
void _build_tree(const State &s,node *pa,bool amb=0){ | |
if(cache.find(s)!=cache.end()){ | |
pa->child.push_back(vector<node*>(1,cache[s])); | |
return; | |
} | |
node *o; | |
if(s.completed()){ | |
o=new node(s); | |
if(amb)pa->child.back().push_back(o); | |
else pa->child.push_back(vector<node*>(1,o)); | |
}else o=pa->child.back().back(); | |
amb=0; | |
for(auto div:table[s.end].div[s]){ | |
if(!amb)_build_tree(div.first,pa); | |
_build_tree(div.second,o,amb); | |
amb=1; | |
} | |
if(s.completed())cache[s]=o; | |
} | |
inline node *build_tree(const State &s){ | |
init_cache(); | |
node o; | |
_build_tree(s,&o); | |
assert(o.child.size()==1); | |
assert(o.child.back().size()==1); | |
return o.child.back().back(); | |
} | |
void print_tree(node *o,int dep=0){ | |
cout<<string(dep,' '),o->s.print(); | |
for(auto div:o->child){ | |
for(auto nd:div){ | |
print_tree(nd,dep+2); | |
} | |
} | |
} |
沒有留言:
張貼留言