Skip to content

Instantly share code, notes, and snippets.

@beta
Created November 14, 2015 12:52
Show Gist options
  • Save beta/4a1a98774ed3c31f4e91 to your computer and use it in GitHub Desktop.
Save beta/4a1a98774ed3c31f4e91 to your computer and use it in GitHub Desktop.
A simple lexer for C programs.
package lexer;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
/**
 * A simple, line-oriented lexer for C source files.
 *
 * <p>The lexer reads a file, skips preprocessor directives and comments, and
 * prints three things to standard output: basic statistics (character, word
 * and line counts), the symbol table of identifiers, and the token stream.
 *
 * <p>Tokens are rendered as {@code (kind, value)} pairs where {@code kind} is
 * one of {@code key}, {@code id}, {@code const}, {@code op} or {@code delim}.
 * For identifiers the value is the index of the name in the symbol table.
 */
public class Lexer {

    /** Length of the longest entry in {@link #OPERATORS}; bounds the longest-match search. */
    private static final int MAX_OPERATOR_LENGTH = 3;

    private static final String[] KEYWORDS = {
        "auto", "break", "case", "char", "const", "continue", "default", "do",
        "double", "else", "enum", "extern", "float", "for", "goto", "if", "int",
        "long", "register", "return", "short", "signed", "sizeof", "static",
        "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while"
    };

    private static final String[] OPERATORS = {
        "<", ">", "<=", ">=", "==", "!=", "+", "-", "*", "/", "%", "++", "--",
        "<<", ">>", "&", "|", "^", "~", "&&", "||", "!", "?", ":", "=", "+=", "-=",
        "*=", "/=", "%=", "<<=", ">>=", "&=", "^=", "|=", "->", "."
    };

    private static final String[] DELIMITERS = {
        ",", ";", "(", ")", "[", "]", "{", "}"
    };

    private final String mInputFile;
    private List<String> mSymbols;
    private long mCharCount = 0;
    private long mWordCount = 0;
    private long mLineCount = 0;

    /** True while scanning inside a block comment that was opened on an earlier line. */
    private boolean mIsInComment = false;

    /**
     * Creates a lexer for the given file.
     *
     * @param inputFile path of the C source file to analyse
     */
    public Lexer(String inputFile) {
        mInputFile = inputFile;
    }

    /**
     * Lexes the input file and prints the statistics, the symbol table and the
     * token stream to standard output.
     *
     * <p>All per-run state is reset first, so calling this method repeatedly
     * on the same instance produces the same output each time.
     */
    public void run() {
        mSymbols = new ArrayList<>();
        mWordCount = 0;
        mIsInComment = false;

        StringBuilder result = new StringBuilder();
        for (String line : readFile()) {
            // Skip preprocessor directives (possibly indented). A line that
            // lies inside an open block comment is never a directive and must
            // still be scanned so that the closing "*/" is noticed.
            if (mIsInComment || !line.trim().startsWith("#")) {
                result.append(processLine(line));
            }
        }

        System.out.println("字符数:" + mCharCount);
        System.out.println("单词数:" + mWordCount);
        System.out.println("行数:" + mLineCount);
        System.out.println();
        System.out.println("符号表:");
        for (int i = 0; i < mSymbols.size(); i++) {
            System.out.printf("%d\t%s\n", i, mSymbols.get(i));
        }
        System.out.println();
        System.out.println("记号流:");
        System.out.println(result);
    }

    /**
     * Reads the input file as UTF-8 and records its character and line counts.
     *
     * <p>The character count is the number of Unicode code points in the file
     * (line terminators included), not its size in bytes. If the file cannot
     * be read, an error is printed and the JVM exits with status 2.
     *
     * @return the lines of the file, without line terminators
     */
    private List<String> readFile() {
        try {
            String text = new String(
                    Files.readAllBytes(Paths.get(mInputFile)), StandardCharsets.UTF_8);
            mCharCount = text.codePointCount(0, text.length());
            List<String> lines = Files.readAllLines(Paths.get(mInputFile), StandardCharsets.UTF_8);
            mLineCount = lines.size();
            return lines;
        } catch (IOException ex) {
            System.err.println("无法打开文件 " + mInputFile);
            ex.printStackTrace();
            System.exit(2);
        }
        return new ArrayList<>(); // only reached if System.exit is intercepted
    }

    /**
     * Lexes one line of code.
     *
     * <p>Block-comment state is carried across calls in {@link #mIsInComment},
     * so lines must be passed in file order.
     *
     * @param line the line to process
     * @return the token stream for that line
     */
    private String processLine(String line) {
        StringBuilder tokens = new StringBuilder();
        final int length = line.length();
        int i = 0;
        while (i < length) {
            if (mIsInComment) {
                int close = line.indexOf("*/", i);
                if (close < 0) {
                    break; // the comment continues on the next line
                }
                mIsInComment = false;
                i = close + 2;
                continue;
            }

            char c = line.charAt(i);
            char next = (i + 1 < length) ? line.charAt(i + 1) : '\0';

            if (Character.isWhitespace(c)) {
                i++;
                continue;
            }
            if (c == '/' && next == '/') {
                break; // line comment: ignore the rest of the line
            }
            if (c == '/' && next == '*') {
                mIsInComment = true;
                i += 2;
                continue;
            }

            if (c == '_' || isAlphabet(c)) {
                // Keyword or identifier.
                int end = scanIdentifier(line, i);
                String name = line.substring(i, end);
                // sizeof is also listed as a keyword, so it must be tested
                // first for it to be reported as an operator.
                if (name.equals("sizeof")) {
                    tokens.append("(op, sizeof)");
                } else if (isKeyword(name)) {
                    tokens.append("(key, ").append(name).append(")");
                } else {
                    tokens.append("(id, ").append(addSymbol(name)).append(")");
                }
                mWordCount++;
                i = end;
            } else if (isDigit(c) || (c == '.' && isDigit(next))) {
                // Numeric constant.
                int end = scanNumber(line, i);
                tokens.append("(const, ").append(line, i, end).append(")");
                mWordCount++;
                i = end;
            } else if (c == '\'' || c == '"') {
                // Character or string constant. An unterminated literal
                // extends to the end of the line instead of failing.
                int close = findClosingQuote(line, i);
                int contentEnd = (close < 0) ? length : close;
                int end = (close < 0) ? length : close + 1;
                tokens.append("(const, ").append(line, i, end).append(")");
                mWordCount += line.substring(i + 1, contentEnd).split(" ").length;
                i = end;
            } else {
                String op = matchOperator(line, i);
                if (op != null) {
                    tokens.append("(op, ").append(op).append(")");
                    i += op.length();
                } else {
                    if (isDelimiter(String.valueOf(c))) {
                        tokens.append("(delim, ").append(c).append(")");
                    }
                    i++; // characters that are not recognised are skipped
                }
            }
        }
        return tokens.toString();
    }

    /**
     * Returns the index just past the identifier that starts at {@code start}.
     */
    private static int scanIdentifier(String line, int start) {
        int k = start;
        while (k < line.length()) {
            char ch = line.charAt(k);
            if (!(ch == '_' || isAlphabet(ch) || isDigit(ch))) {
                break;
            }
            k++;
        }
        return k;
    }

    /**
     * Returns the index just past the numeric constant that starts at
     * {@code start}.
     *
     * <p>Digits, letters and dots are consumed so that prefixes and suffixes
     * ({@code 0x1F}, {@code 10L}, {@code 1.5f}) stay part of the constant. A
     * sign is consumed only directly after an exponent marker: {@code e/E}
     * for decimal constants, {@code p/P} for hexadecimal ones.
     */
    private static int scanNumber(String line, int start) {
        boolean hex = line.startsWith("0x", start) || line.startsWith("0X", start);
        int k = start;
        while (k < line.length()) {
            char ch = line.charAt(k);
            if (isDigit(ch) || isAlphabet(ch) || ch == '.') {
                k++;
            } else if ((ch == '+' || ch == '-') && k > start
                    && isExponentMarker(line.charAt(k - 1), hex)) {
                k++;
            } else {
                break;
            }
        }
        return k;
    }

    /** Tells whether {@code ch} introduces an exponent in a decimal or hexadecimal constant. */
    private static boolean isExponentMarker(char ch, boolean hex) {
        return hex ? (ch == 'p' || ch == 'P') : (ch == 'e' || ch == 'E');
    }

    /**
     * Finds the quote that closes the literal opened at {@code open}.
     *
     * @return the index of the closing quote, or -1 if the literal is not
     *         closed on this line
     */
    private static int findClosingQuote(String line, int open) {
        char quote = line.charAt(open);
        for (int k = open + 1; k < line.length(); k++) {
            char ch = line.charAt(k);
            if (ch == '\\') {
                k++; // skip the escaped character, e.g. \" or \\
            } else if (ch == quote) {
                return k;
            }
        }
        return -1;
    }

    /**
     * Returns the longest operator that starts at {@code start}, or
     * {@code null} if no operator starts there.
     */
    private static String matchOperator(String line, int start) {
        int longest = Math.min(MAX_OPERATOR_LENGTH, line.length() - start);
        for (int len = longest; len >= 1; len--) {
            String candidate = line.substring(start, start + len);
            if (isOperator(candidate)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Adds a symbol to the symbol table.
     * A symbol that is already present is not added again.
     *
     * @param symbol the symbol to add
     * @return the index of the symbol in the symbol table
     */
    private int addSymbol(String symbol) {
        int index = mSymbols.indexOf(symbol);
        if (index < 0) {
            mSymbols.add(symbol);
            index = mSymbols.size() - 1;
        }
        return index;
    }

    /** Tells whether {@code c} is an ASCII decimal digit. */
    private static boolean isDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /** Tells whether {@code c} is an ASCII letter. */
    private static boolean isAlphabet(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    private static boolean isKeyword(String s) {
        return contains(KEYWORDS, s);
    }

    private static boolean isOperator(String s) {
        return contains(OPERATORS, s);
    }

    private static boolean isDelimiter(String s) {
        return contains(DELIMITERS, s);
    }

    /** Linear search of a small lookup table. */
    private static boolean contains(String[] table, String s) {
        for (String entry : table) {
            if (entry.equals(s)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the path of the file to lex; the program
     *             exits with status 1 when it is missing
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.err.println("缺少文件名");
            System.exit(1);
        }
        new Lexer(args[0]).run();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment