斷詞系統的長詞優先法分為正向與反向兩種,兩者都會產生歧異與未知詞的問題;若詞庫夠大,這些問題就能解決。這裡僅示範長詞優先法的程式。
已解決正向與反向長詞優先法的重複字問題 2018/4/09 15:00
已添加詞位標籤 2018/4/09 15:00
語料庫下載_Mysql
------------SQL------------
using System.Collections.Generic;
using System.Data;
using System;
using MySql.Data.MySqlClient;
using System.Text;
namespace SQL
{
    // Static helper that owns one shared MySQL connection and exposes the
    // few queries used by the word-segmentation scripts.
    public class SQL
    {
        // Shared connection; assigned by openSqlConnection and reset to null
        // by closeSqlConnection.
        public static MySqlConnection mySqlConnection;

        // Database host.
        private static string HOST = "SQL IP";
        public static string HOST_TYPE
        {
            set { HOST = value; }
            get { return HOST; }
        }

        // Database user id.
        private static string ID = "SQL USER ID";
        public static string ID_TYPE
        {
            set { ID = value; }
            get { return ID; }
        }

        // Database password.
        private static string PASSWORD = "SQL PASSWORD";
        public static string PASSWORD_TYPE
        {
            set { PASSWORD = value; }
            get { return PASSWORD; }
        }

        // Database name.
        private static string DATABASE = "DATABASE_NAME";
        public static string DATABASE_TYPE
        {
            set { DATABASE = value; }
            get { return DATABASE; }
        }

        // Database port.
        private static string PORT = "SQL PORT";
        public static string PORT_TYPE
        {
            set { PORT = value; }
            get { return PORT; }
        }

        // Connection result: the server version on success, or an error
        // message when the connection could not be opened.
        public static string RESULT = "";

        // Rows collected by INQUIRE_TABLE; emptied by MYSQL_MESSAGE_CLERA.
        public static List<string> MESSAGE = new List<string>();

        // Opens the shared connection using the given connection string.
        // Failures are not thrown; they are reported through RESULT only.
        public static void openSqlConnection(string connction)
        {
            try
            {
                mySqlConnection = new MySqlConnection(connction);
                mySqlConnection.Open();
                RESULT = mySqlConnection.ServerVersion;
            }
            catch
            {
                RESULT = "無法連接資料庫";
            }
        }

        // Closes and releases the shared connection. Does nothing when no
        // connection object exists (never opened, or already closed).
        public static void closeSqlConnection()
        {
            if (mySqlConnection == null)
            {
                return;
            }
            mySqlConnection.Close();
            mySqlConnection.Dispose();
            mySqlConnection = null;
        }

        // Executes sqlQuery on the shared connection and discards any rows.
        public static void DOQUERY(string sqlQuery)
        {
            using (IDbCommand dbcommand = mySqlConnection.CreateCommand())
            {
                dbcommand.CommandText = sqlQuery;
                // The reader is opened only to run the statement; disposing
                // it closes it even if execution throws part-way.
                using (IDataReader reader = dbcommand.ExecuteReader())
                {
                }
            }
        }

        #region Get DataSet
        // Runs sqlString and returns the result as a DataSet.
        // Throws Exception (with the original error as InnerException) when
        // the query fails.
        public static DataSet GetDataSet(string sqlString)
        {
            DataSet ds = new DataSet();
            try
            {
                using (MySqlDataAdapter da = new MySqlDataAdapter(sqlString, mySqlConnection))
                {
                    // Without Fill the adapter never executes and the
                    // returned DataSet is always empty.
                    da.Fill(ds);
                }
            }
            catch (Exception e)
            {
                throw new Exception("SQL:" + sqlString + "\n", e);
            }
            return ds;
        }
        #endregion

        // Round-trips messae through UTF-8 bytes and back to a string.
        public static string STRING_UTF8(string messae)
        {
            byte[] bytes = Encoding.UTF8.GetBytes(messae);
            return Encoding.UTF8.GetString(bytes);
        }

        // Builds "select * from <table> where Message=@message" with the
        // word bound as a parameter, so quotes in the word cannot alter the
        // statement. The table name cannot be bound and is still
        // concatenated: pass trusted table names only.
        private static MySqlCommand createMessageLookup(string dataBaseTitle, string message)
        {
            string sqlText = "select * from " + dataBaseTitle + " where Message=@message";
            MySqlCommand cmd = new MySqlCommand(sqlText, mySqlConnection);
            cmd.Parameters.AddWithValue("@message", message);
            return cmd;
        }

        // Lexical tag lookup: returns column index 5 of the last row whose
        // Message equals message, or "" when no row matches.
        // NOTE(review): index 5 is presumably the lexical-tag column; confirm
        // against the table schema.
        public static string INQUIRE_Lexical(string dataBaseTitle, string message)
        {
            string str1 = "";
            using (MySqlCommand cmd = createMessageLookup(dataBaseTitle, message))
            using (MySqlDataReader data = cmd.ExecuteReader())
            {
                while (data.Read())
                {
                    str1 = data[5].ToString();
                }
            }
            return str1;
        }

        // Word lookup used by segmentation: appends column 0 of every row
        // whose Message equals message to MESSAGE.
        public static void INQUIRE_TABLE(string dataBaseTitle, string message)
        {
            using (MySqlCommand cmd = createMessageLookup(dataBaseTitle, message))
            using (MySqlDataReader data = cmd.ExecuteReader())
            {
                while (data.Read())
                {
                    MESSAGE.Add(data[0].ToString());
                }
            }
        }

        // Empties MESSAGE.
        public static void MYSQL_MESSAGE_CLERA()
        {
            MESSAGE.Clear();
        }
    }
}
------------斷字處理功能------------
using System.Collections.Generic;
using UnityEngine;
using System.Collections;
using System.Text.RegularExpressions;
// String helpers used by the segmenter: strip marker brackets and cut a
// message into fixed-size chunks of non-whitespace characters.
public class SegmentationSymbol
{
    // Bracket characters used as segmentation markers.
    private string[] specialSymbol = new string[] { "(", ")" };

    // Replaces every marker bracket in userMessage with a single space.
    public string delectSymbol(string userMessage)
    {
        string cleaned = userMessage;
        for (int x = 0; x < specialSymbol.Length; x++)
        {
            cleaned = cleaned.Replace(specialSymbol[x], " ");
        }
        return cleaned;
    }

    // Builds a regex that matches (status + 1) consecutive non-whitespace
    // characters. A negative status yields the empty pattern.
    private static string buildChunkPattern(int status)
    {
        System.Text.StringBuilder pattern = new System.Text.StringBuilder();
        for (int x = 0; x <= status; x++)
        {
            pattern.Append(@"\S");
        }
        return pattern.ToString();
    }

    // Returns the successive non-overlapping chunks of (status + 1)
    // non-whitespace characters found in message, in order of appearance.
    // A trailing remainder shorter than the chunk size is not returned.
    public List<Match> brokenSatus(string message, int status)
    {
        List<Match> chunks = new List<Match>();
        MatchCollection splitResult = Regex.Matches(message, buildChunkPattern(status), RegexOptions.IgnoreCase);
        foreach (Match chunk in splitResult)
        {
            chunks.Add(chunk);
        }
        return chunks;
    }

    // Keeps the first `status` characters of message and drops the rest.
    // Throws ArgumentOutOfRangeException when status is not a valid
    // position in message (see String.Remove).
    public string delectWord(string message, int status)
    {
        return message.Remove(status);
    }

    // Returns a list holding at most one entry: message with every
    // occurrence of its FIRST (status + 1)-character chunk removed.
    // The list is empty when message contains no such chunk.
    public List<string> delect_Word_Combination(string message, int status)
    {
        List<string> stringList = new List<string>();
        Match firstChunk = Regex.Match(message, buildChunkPattern(status), RegexOptions.IgnoreCase);
        if (firstChunk.Success)
        {
            stringList.Add(message.Replace(firstChunk.Value, ""));
        }
        return stringList;
    }

    // For each (status + 1)-character chunk of message, returns message
    // with every occurrence of that chunk removed (one entry per chunk).
    public List<string> delect_oneWord(string message, int status)
    {
        List<string> stringList = new List<string>();
        MatchCollection splitResult = Regex.Matches(message, buildChunkPattern(status), RegexOptions.IgnoreCase);
        foreach (Match chunk in splitResult)
        {
            stringList.Add(message.Replace(chunk.Value, ""));
        }
        return stringList;
    }
}
------------斷詞程式------------
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System.Text.RegularExpressions;
using System;
// Unity component that segments `str` with forward and reverse
// longest-word-first matching against the dictionary table, then looks up
// the lexical tag of every word found. All passes run as coroutines that
// poll every 0.5 seconds.
public class LongWordsFirst : MonoBehaviour
{
    // Input sentence, followed by the working copies consumed by the forward
    // pass, the reverse pass and the forward correction pass.
    public string str = "近年來知名夜市", logeWordForward, logeWordReverse, combination_string;
    private SegmentationSymbol segmentationSymbol = new SegmentationSymbol();
    private List<string> listMessage = new List<string>();
    private int messageNumber, messageNumber2, forward_Num;
    // Words found by the forward longest-word-first pass.
    private List<string> longeWordForward_Message = new List<string>();
    // Forward words after correction.
    private List<string> forward_exclufr = new List<string>();
    // Words found by the reverse longest-word-first pass.
    private List<string> longeWordReverse_Message = new List<string>();
    // Lexical tags of the corrected forward words.
    private List<string> forwardLexical = new List<string>();
    // Lexical tags of the reverse words.
    private List<string> reverseLexical = new List<string>();
    // Final "|"-terminated results: words and lexical tags.
    public string fulfilForward, fulfilReverse, fulfil_ForwardLexical, fulfil_ReverseLexical;
    // Dictionary table and grammar-combination table.
    public static string dataBasetitle = "glossary", dataBasetitle2 = "grammar";
    // Coroutines started by the current segmentation run, kept so that a new
    // run can stop them instead of stacking on top of them.
    private readonly List<Coroutine> segmentationRoutines = new List<Coroutine>();

    void Start()
    {
        openDataBase();
    }

    // Opens the database connection from the settings held by SQL.SQL.
    void openDataBase()
    {
        string connectionString = string.Format("Server = {0}; Database = {1}; UserID = {2}; Password = {3}; Port = {4};", SQL.SQL.HOST_TYPE, SQL.SQL.DATABASE_TYPE, SQL.SQL.ID_TYPE, SQL.SQL.PASSWORD_TYPE, SQL.SQL.PORT_TYPE);
        SQL.SQL.openSqlConnection(connectionString);
    }

    void OnGUI()
    {
        if (GUILayout.Button("測試"))
        {
            cutting_();
        }
        if (GUILayout.Button("詞位"))
        {
            // Put the results back into sentence order.
            StartCoroutine(order());
        }
    }

    void OnApplicationQuit()
    {
        SQL.SQL.closeSqlConnection();
    }

    // Stops every coroutine started by the previous segmentation run. The
    // loops never end by themselves, so without this each run would add five
    // more coroutines mutating the same fields.
    private void stopSegmentation()
    {
        foreach (Coroutine routine in segmentationRoutines)
        {
            if (routine != null)
            {
                StopCoroutine(routine);
            }
        }
        segmentationRoutines.Clear();
    }

    // Starts a fresh segmentation run on `str`.
    private void cutting_()
    {
        stopSegmentation();
        listClear();
        // Every pass works on its own copy of the input.
        logeWordForward = str;
        logeWordReverse = str;
        combination_string = str;
        messageNumber = logeWordReverse.Length;
        messageNumber2 = logeWordForward.Length;
        forward_Num = combination_string.Length;
        segmentationRoutines.Add(StartCoroutine(loneWordForward(logeWordForward.Length)));
        segmentationRoutines.Add(StartCoroutine(longWordReverse(0)));
        // Lexical tags.
        segmentationRoutines.Add(StartCoroutine(forwardLable()));
        segmentationRoutines.Add(StartCoroutine(reverseLable()));
        // Forward correction.
        segmentationRoutines.Add(StartCoroutine(ForwardAnalysis(0)));
    }

    // Joins items from last to first, each followed by "|".
    private static string joinReversed(List<string> items)
    {
        System.Text.StringBuilder joined = new System.Text.StringBuilder();
        for (int x = items.Count - 1; x >= 0; x--)
        {
            joined.Append(items[x]).Append("|");
        }
        return joined.ToString();
    }

    // Waits until the correction pass has consumed the whole sentence, then
    // writes the four result strings with the lists reversed. The results
    // are rebuilt from scratch, so running this twice does not duplicate
    // them.
    // NOTE(review): only combination_string is checked; the reverse pass and
    // the tag lookups may still be running when the results are written.
    IEnumerator order()
    {
        while (true)
        {
            if (string.IsNullOrEmpty(combination_string))
            {
                fulfilForward = joinReversed(forward_exclufr);
                fulfilReverse = joinReversed(longeWordReverse_Message);
                fulfil_ForwardLexical = joinReversed(forwardLexical);
                fulfil_ReverseLexical = joinReversed(reverseLexical);
                break;
            }
            yield return new WaitForSeconds(0.5f);
        }
    }

    // Looks up one lexical tag per tick for each entry of words that has not
    // been tagged yet. Stops, after logging, when a lookup throws.
    private IEnumerator labelWords(List<string> words, List<string> labels)
    {
        int next = 0;
        while (true)
        {
            try
            {
                if (next < words.Count)
                {
                    labels.Add(SQL.SQL.INQUIRE_Lexical(dataBasetitle, words[next]));
                    next++;
                }
            }
            catch (Exception e)
            {
                Debug.LogException(e);
                break;
            }
            yield return new WaitForSeconds(0.5f);
        }
    }

    // Lexical tags for the reverse words.
    IEnumerator reverseLable()
    {
        return labelWords(longeWordReverse_Message, reverseLexical);
    }

    // Lexical tags for the corrected forward words.
    IEnumerator forwardLable()
    {
        return labelWords(forward_exclufr, forwardLexical);
    }

    // Forward longest-word-first pass. Each tick cuts logeWordForward into
    // chunks of (messageNumber2 + 1) characters, keeps the chunks found in
    // the dictionary, then shrinks the chunk size by one.
    IEnumerator loneWordForward(int lenght)
    {
        while (true)
        {
            // Chunk size is Length + 1 here, which cannot match a string of
            // that length without whitespace; kept as in the original.
            foreach (Match whole in segmentationSymbol.brokenSatus(logeWordForward, logeWordForward.Length))
            {
                SQL.SQL.INQUIRE_TABLE(dataBasetitle, whole.ToString());
                foreach (string row in SQL.SQL.MESSAGE)
                {
                    if (row == whole.ToString())
                    {
                        longeWordForward_Message.Add(whole.ToString());
                        logeWordForward = segmentationSymbol.delectWord(logeWordForward, messageNumber2);
                    }
                }
                SQL.SQL.MYSQL_MESSAGE_CLERA();
            }
            foreach (Match chunk in segmentationSymbol.brokenSatus(logeWordForward, messageNumber2))
            {
                SQL.SQL.INQUIRE_TABLE(dataBasetitle, chunk.ToString());
                foreach (string row in SQL.SQL.MESSAGE)
                {
                    if (row == chunk.ToString())
                    {
                        longeWordForward_Message.Add(row);
                    }
                    // These two statements run for every returned row, not
                    // only for matching ones: drop all words found so far
                    // and restart from the new length.
                    logeWordForward = inspection(logeWordForward, longeWordForward_Message);
                    messageNumber2 = logeWordForward.Length;
                }
                SQL.SQL.MYSQL_MESSAGE_CLERA();
            }
            if (messageNumber2 >= 0)
            {
                messageNumber2 -= 1;
            }
            yield return new WaitForSeconds(0.5f);
        }
    }

    // Reverse longest-word-first pass. Each tick drops the first
    // (lenght + 1) characters of logeWordReverse and checks whether what is
    // left is a dictionary word; a hit is cut off the end of the sentence.
    // NOTE(review): SQL.SQL.MESSAGE is never cleared in this coroutine, so
    // rows from earlier lookups stay in the list; confirm that is intended.
    IEnumerator longWordReverse(int lenght)
    {
        while (true)
        {
            foreach (Match whole in segmentationSymbol.brokenSatus(logeWordReverse, logeWordReverse.Length))
            {
                SQL.SQL.INQUIRE_TABLE(dataBasetitle, whole.ToString());
                foreach (string row in SQL.SQL.MESSAGE)
                {
                    if (row == whole.ToString())
                    {
                        longeWordReverse_Message.Add(whole.ToString());
                        logeWordReverse = segmentationSymbol.delectWord(logeWordReverse, whole.Length);
                    }
                }
            }
            // Queried here so the "whole remainder" check below can find it.
            SQL.SQL.INQUIRE_TABLE(dataBasetitle, logeWordReverse);
            foreach (string candidate in segmentationSymbol.delect_Word_Combination(logeWordReverse, lenght))
            {
                SQL.SQL.INQUIRE_TABLE(dataBasetitle, candidate);
                foreach (string row in SQL.SQL.MESSAGE)
                {
                    if (row == candidate)
                    {
                        longeWordReverse_Message.Add(row);
                        logeWordReverse = segmentationSymbol.delectWord(logeWordReverse, logeWordReverse.Length - row.Length);
                        lenght = 0;
                    }
                }
            }
            lenght += 1;
            // The whole remainder is itself a dictionary word.
            foreach (string row in SQL.SQL.MESSAGE)
            {
                if (row == logeWordReverse)
                {
                    longeWordReverse_Message.Add(logeWordReverse);
                    logeWordReverse = segmentationSymbol.delectWord(logeWordReverse, logeWordReverse.Length - row.Length);
                }
            }
            if (lenght >= combination_string.Length)
            {
                lenght = 0;
            }
            yield return new WaitForSeconds(0.5f);
        }
    }

    // Forward correction pass. Walks combination_string from its end and
    // moves the words found by the forward pass into forward_exclufr,
    // consuming combination_string as it goes.
    IEnumerator ForwardAnalysis(int lenght)
    {
        while (true)
        {
            foreach (string candidate in segmentationSymbol.delect_Word_Combination(combination_string, lenght))
            {
                Debug.Log(candidate);
                for (int x = 0; x < longeWordForward_Message.Count; x++)
                {
                    if (candidate == longeWordForward_Message[x])
                    {
                        forward_exclufr.Add(candidate);
                        combination_string = segmentationSymbol.delectWord(combination_string, combination_string.Length - longeWordForward_Message[x].Length);
                        longeWordForward_Message.Remove(candidate);
                        lenght = 0;
                    }
                }
            }
            lenght += 1;
            for (int x = 0; x < longeWordForward_Message.Count; x++)
            {
                if (combination_string == longeWordForward_Message[x])
                {
                    // Capture the word first: combination_string is about to
                    // be emptied, and the field `str` holds the whole input
                    // sentence rather than this word.
                    string matched = longeWordForward_Message[x];
                    forward_exclufr.Add(matched);
                    combination_string = inspection(combination_string, forward_exclufr);
                    longeWordForward_Message.Remove(matched);
                    lenght = 0;
                }
            }
            if (lenght >= combination_string.Length)
            {
                lenght = 0;
            }
            yield return new WaitForSeconds(0.5f);
        }
    }

    // Returns message with every occurrence of every entry of list removed.
    private string inspection(string message, List<string> list)
    {
        string remaining = message;
        foreach (string word in list)
        {
            remaining = remaining.Replace(word, "");
        }
        return remaining;
    }

    // Resets all per-run state so a new run does not see words, tags or
    // result strings from the previous one.
    private void listClear()
    {
        longeWordForward_Message.Clear();
        longeWordReverse_Message.Clear();
        forward_exclufr.Clear();
        forwardLexical.Clear();
        reverseLexical.Clear();
        fulfilForward = "";
        fulfilReverse = "";
        fulfil_ForwardLexical = "";
        fulfil_ReverseLexical = "";
        SQL.SQL.MYSQL_MESSAGE_CLERA();
    }
}
------------結果圖------------