Java实现AC自动机全文检索示例

2025-05-29 0 46

第一步,构建Trie树,定义Node类型:

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25
/**

* Created by zhaoyy on 2017/2/7.

*/

interface Node {

char value();

boolean exists();

boolean isRoot();

Node parent();

Node childOf(char c);

Node fail();

void setFail(Node node);

void setExists(boolean exists);

void add(Node child);

List<Node> children();

}

第二步,实现两种Node,如果词汇全是可打印的ASCII字符,就采用AsciiNode,否则(比如包含汉字),使用基于hash表的MapNode;这两种Node均集成自AbstractNode:

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185
/**

* Created by zhaoyy on 2017/2/8.

*/

abstract class AbstractNode implements Node {

private static final char EMPTY = '\\0';

private final char value;

private final Node parent;

private boolean exists;

private Node fail;

public AbstractNode(Node parent, char value) {

this.parent = parent;

this.value = value;

this.exists = false;

this.fail = null;

}

public AbstractNode() {

this(null, EMPTY);

}

private static String tab(int n) {

StringBuilder builder = new StringBuilder();

for (int i = 0; i < n; i++) {

builder.append('\\t');

}

return builder.toString();

}

private static String toString(Node node, int depth) {

StringBuilder builder = new StringBuilder();

String tab = tab(depth);

Node fail = node.fail();

Node parent = node.parent();

builder

.append(tab)

.append('<')

.append(node.value())

.append(" exists=\\"")

.append(node.exists())

.append('"')

.append(" parent=\\"")

.append(parent == null ? "null" : parent.value())

.append('"')

.append(" fail=\\"")

.append(fail == null ? "null" : fail.value())

.append("\\">\\r\\n");

for (Node child : node.children())

builder.append(toString(child, depth + 1));

builder

.append(tab)

.append("</")

.append(node.value())

.append(">\\r\\n");

return builder.toString();

}

@Override

public char value() {

return value;

}

@Override

public boolean exists() {

return exists;

}

@Override

public boolean isRoot() {

return value == EMPTY;

}

@Override

public Node parent() {

return parent;

}

@Override

public Node fail() {

return fail;

}

@Override

public void setFail(Node node) {

this.fail = node;

}

@Override

public void setExists(boolean exists) {

this.exists = exists;

}

@Override

public String toString() {

return toString(this, 0);

}

}

/////////////////////////////////////////////////////////////////////////////////////////////

/**

* Created by zhaoyy on 2017/2/8.

*/

final class AsciiNode extends AbstractNode implements Node {

private static final char FROM = 32;

private static final char TO = 126;

private final Node[] children;

public AsciiNode() {

super();

this.children = new Node[TO - FROM + 1];

}

public AsciiNode(Node parent, char value) {

super(parent, value);

this.children = new Node[TO - FROM + 1];

}

@Override

public Node childOf(char c) {

if (c >= FROM && c <= TO)

return children[(int) c - FROM];

else return null;

}

@Override

public void add(Node child) {

int index = (int) child.value();

children[index - FROM] = child;

}

@Override

public List<Node> children() {

List<Node> nodes = new ArrayList<Node>();

for (Node child : children)

if (child != null)

nodes.add(child);

return nodes;

}

}

//////////////////////////////////////////////////////////////////////////////////////////////

/**

* Created by zhaoyy on 2017/2/8.

*/

final class MapNode extends AbstractNode implements Node {

private final Map<Character, Node> children;

public MapNode() {

super();

this.children = new HashMap<Character, Node>();

}

public MapNode(Node parent, char value) {

super(parent, value);

this.children = new HashMap<Character, Node>();

}

@Override

public Node childOf(char c) {

return children.get(c);

}

@Override

public void add(Node child) {

children.put(child.value(), child);

}

@Override

public List<Node> children() {

List<Node> nodes = new ArrayList<Node>();

nodes.addAll(children.values());

return nodes;

}

}

第三步,

首先定义一个Node构造器:

?

1

2

3

4

5

6

7

8

9
/**

* Created by zhaoyy on 2017/2/8.

*/

public interface NodeMaker {

Node make(Node parent, char value);

Node makeRoot();

}

然后构建AC自动机,实现创建及查找方法:

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155
/**

* Created by zhaoyy on 2017/2/7.

*/

public final class WordTable {

private final Node root;

private WordTable(Collection<? extends CharSequence> words, NodeMaker maker) {

Node root = buildTrie(words, maker);

setFailNode(root);

this.root = root;

}

public static WordTable compile(Collection<? extends CharSequence> words) {

if (words == null || words.isEmpty())

throw new IllegalArgumentException();

final NodeMaker maker;

if (isAscii(words))

maker = new NodeMaker() {

@Override

public Node make(Node parent, char value) {

return new AsciiNode(parent, value);

}

@Override

public Node makeRoot() {

return new AsciiNode();

}

};

else maker = new NodeMaker() {

@Override

public Node make(Node parent, char value) {

return new MapNode(parent, value);

}

@Override

public Node makeRoot() {

return new MapNode();

}

};

return new WordTable(words, maker);

}

private static boolean isAscii(Collection<? extends CharSequence> words) {

for (CharSequence word : words) {

int len = word.length();

for (int i = 0; i < len; i++) {

int c = (int) word.charAt(i);

if (c < 32 || c > 126)

return false;

}

}

return true;

}

private static Node buildTrie(Collection<? extends CharSequence> sequences, NodeMaker maker) {

Node root = maker.makeRoot();

for (CharSequence sequence : sequences) {

int len = sequence.length();

Node current = root;

for (int i = 0; i < len; i++) {

char c = sequence.charAt(i);

Node node = current.childOf(c);

if (node == null) {

node = maker.make(current, c);

current.add(node);

}

current = node;

if (i == len - 1)

node.setExists(true);

}

}

return root;

}

private static void setFailNode(final Node root) {

root.setFail(null);

Queue<Node> queue = new LinkedList<Node>();

queue.add(root);

while (!queue.isEmpty()) {

Node parent = queue.poll();

Node temp;

for (Node child : parent.children()) {

if (parent.isRoot())

child.setFail(root);

else {

temp = parent.fail();

while (temp != null) {

Node node = temp.childOf(child.value());

if (node != null) {

child.setFail(node);

break;

}

temp = temp.fail();

}

if (temp == null)

child.setFail(root);

}

queue.add(child);

}

}

}

public boolean findAnyIn(CharSequence cs) {

int len = cs.length();

Node node = root;

for (int i = 0; i < len; i++) {

Node next = node.childOf(cs.charAt(i));

if (next == null) {

next = node.fail();

if (next == null) {

node = root;

continue;

}

}

if (next.exists())

return true;

}

return false;

}

public List<MatchInfo> search(CharSequence cs) {

if (cs == null || cs.length() == 0)

return Collections.emptyList();

List<MatchInfo> result = new ArrayList<MatchInfo>();

int len = cs.length();

Node node = root;

for (int i = 0; i < len; i++) {

Node next = node.childOf(cs.charAt(i));

if (next == null) {

next = node.fail();

if (next == null) {

node = root;

continue;

}

}

if (next.exists()) {

MatchInfo info = new MatchInfo(i, next);

result.add(info);

node = root;

continue;

}

node = next;

}

return result;

}

@Override

public String toString() {

return root.toString();

}

}

定义一个保存查找结果的实体:

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39
/**

* Created by zhaoyy on 2017/2/7.

*/

public final class MatchInfo {

private final int index;

private final String word;

public MatchInfo(int index, String word) {

this.index = index;

this.word = word;

}

public MatchInfo(int index, Node node) {

StringBuilder builder = new StringBuilder();

while (node != null) {

if (!node.isRoot())

builder.append(node.value());

node = node.parent();

}

String word = builder.reverse().toString();

this.index = index + 1 - word.length();

this.word = word;

}

public int getIndex() {

return index;

}

public String getWord() {

return word;

}

@Override

public String toString() {

return index + ":" + word;

}

}

第四步,调用Demo:

?

1

2

3

4

5

6
public static void main(String[] args) {

List<String> list = Arrays.asList("say", "her", "he", "she", "shr", "alone");

WordTable table = WordTable.compile(list);

System.out.println(table);

System.out.println(table.search("1shesaynothingabouthislivinghimalone"));

}

以下是输出结果:

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32
< exists="false" parent="null" fail="null">

<s exists="false" parent=" " fail=" ">

<a exists="false" parent="s" fail="a">

<y exists="true" parent="a" fail=" ">

</y>

</a>

<h exists="false" parent="s" fail="h">

<e exists="true" parent="h" fail="e">

</e>

<r exists="true" parent="h" fail=" ">

</r>

</h>

</s>

<h exists="false" parent=" " fail=" ">

<e exists="true" parent="h" fail=" ">

<r exists="true" parent="e" fail=" ">

</r>

</e>

</h>

<a exists="false" parent=" " fail=" ">

&lt;l exists="false" parent="a" fail=" ">

<o exists="false" parent="l" fail=" ">

<n exists="false" parent="o" fail=" ">

<e exists="true" parent="n" fail=" ">

</e>

</n>

</o>

&lt;/l>

</a>

</ >

[1:she, 4:say, 31:alone]

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持快网idc。

原文链接:https://my.oschina.net/u/2541538/blog/833284

收藏 (0) 打赏

感谢您的支持,我会继续努力的!

打开微信/支付宝扫一扫,即可进行扫码打赏哦,分享从这里开始,精彩与您同在
点赞 (0)

声明:本站所有文章,如无特殊说明或标注,均为本站原创发布。任何个人或组织,在未征得本站同意时,禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益,可联系我们进行处理。

快网idc优惠网 建站教程 Java实现AC自动机全文检索示例 https://www.kuaiidc.com/119143.html

相关文章

发表评论
暂无评论