2018年4月11日 星期三

斷詞系統之長詞優先法與詞位標籤_測試版

斷詞系統的長詞優先法分為正向與反向兩種,兩者都會遇到歧義與未知詞的問題;若詞庫夠大,這些問題大多可以獲得解決。這裡僅示範長詞優先法的程式。

已解決正反向長詞優先法的重複字問題  2018/4/09 15:00
已添加詞位標籤  2018/4/09 15:00

語料庫下載_Mysql

------------SQL------------

using System.Collections.Generic;
using System.Data;
using System;
using MySql.Data.MySqlClient;
using System.Text;

namespace SQL
{
    /// <summary>
    /// Static helper around a single shared MySQL connection. It opens and closes
    /// the connection, runs raw statements, and looks up rows of a word table by
    /// the value of their <c>Message</c> column.
    /// </summary>
    public class SQL
    {
        // Shared connection used by every helper below. It is assigned by
        // openSqlConnection and reset to null by closeSqlConnection.
        public static MySqlConnection mySqlConnection;

        // Database host (placeholder value; assign HOST_TYPE before connecting).
        private static string HOST = "SQL IP";
        public static string HOST_TYPE
        {
            set { HOST = value; }
            get { return HOST; }
        }

        // Database user id (placeholder value).
        private static string ID = "SQL USER ID";
        public static string ID_TYPE
        {
            set { ID = value; }
            get { return ID; }
        }

        // Database password (placeholder value).
        private static string PASSWORD = "SQL PASSWORD";
        public static string PASSWORD_TYPE
        {
            set { PASSWORD = value; }
            get { return PASSWORD; }
        }

        // Database (schema) name (placeholder value).
        private static string DATABASE = "DATABASE_NAME";
        public static string DATABASE_TYPE
        {
            set { DATABASE = value; }
            get { return DATABASE; }
        }

        // Database port (placeholder value).
        private static string PORT = "SQL PORT";
        public static string PORT_TYPE
        {
            set { PORT = value; }
            get { return PORT; }
        }

        // Connection result: the server version on success, an error text on failure.
        public static string RESULT = "";

        // Shared result buffer filled by INQUIRE_TABLE and emptied by MYSQL_MESSAGE_CLERA.
        public static List<string> MESSAGE = new List<string>();

        /// <summary>
        /// Opens the shared connection. Failures are not thrown; they are reported
        /// through <see cref="RESULT"/>.
        /// </summary>
        /// <param name="connction">MySQL connection string.</param>
        public static void openSqlConnection(string connction)
        {
            try
            {
                mySqlConnection = new MySqlConnection(connction);
                mySqlConnection.Open();
                RESULT = mySqlConnection.ServerVersion;
            }
            catch
            {
                // Deliberately best-effort: callers inspect RESULT instead of catching.
                RESULT = "無法連接資料庫";
            }
        }

        /// <summary>
        /// Closes and disposes the shared connection. Does nothing when there is no
        /// connection, so it is safe to call twice or before opening.
        /// </summary>
        public static void closeSqlConnection()
        {
            if (mySqlConnection == null)
            {
                return;
            }

            mySqlConnection.Close();
            mySqlConnection.Dispose();
            mySqlConnection = null;
        }

        /// <summary>
        /// Executes a statement on the shared connection and discards any result rows.
        /// </summary>
        /// <param name="sqlQuery">Complete SQL text to execute.</param>
        public static void DOQUERY(string sqlQuery)
        {
            // using-blocks release the command and reader even if execution throws.
            using (IDbCommand dbcommand = mySqlConnection.CreateCommand())
            {
                dbcommand.CommandText = sqlQuery;
                using (IDataReader reader = dbcommand.ExecuteReader())
                {
                    // Result rows are intentionally ignored.
                }
            }
        }

        #region Get DataSet
        /// <summary>
        /// Runs a query on the shared connection and returns its result as a DataSet.
        /// </summary>
        /// <param name="sqlString">SELECT statement to run.</param>
        /// <returns>A DataSet filled with the query result.</returns>
        /// <exception cref="Exception">
        /// Wraps any failure; the message contains the SQL text and the original
        /// error is available as InnerException.
        /// </exception>
        public static DataSet GetDataSet(string sqlString)
        {
            DataSet ds = new DataSet();

            try
            {
                using (MySqlDataAdapter da = new MySqlDataAdapter(sqlString, mySqlConnection))
                {
                    // The adapter only runs the query when Fill is called; without
                    // this call the method always returned an empty DataSet.
                    da.Fill(ds);
                }
            }
            catch (Exception e)
            {
                throw new Exception("SQL:" + sqlString + "\n", e);
            }
            return ds;
        }
        #endregion

        /// <summary>
        /// Encodes the text as UTF-8 bytes and decodes it back to a string.
        /// </summary>
        /// <param name="messae">Text to round-trip.</param>
        /// <returns>The decoded string.</returns>
        public static string STRING_UTF8(string messae)
        {
            byte[] bytes = Encoding.UTF8.GetBytes(messae);
            return Encoding.UTF8.GetString(bytes);
        }

        /// <summary>
        /// Looks up the lexical tag of a word: the value of the column at index 5
        /// of the rows whose <c>Message</c> equals <paramref name="message"/>.
        /// </summary>
        /// <param name="dataBaseTitle">Table to query.</param>
        /// <param name="message">Word to look up.</param>
        /// <returns>
        /// The tag of the last matching row, or an empty string when no row matches.
        /// </returns>
        public static string INQUIRE_Lexical(string dataBaseTitle, string message)
        {
            string lexical = "";

            using (MySqlCommand cmd = BuildMessageLookup(dataBaseTitle, message))
            using (MySqlDataReader data = cmd.ExecuteReader())
            {
                while (data.Read())
                {
                    lexical = data[5].ToString();
                }
            }

            return lexical;
        }

        /// <summary>
        /// Word lookup used by the segmenter: appends column 0 of every row whose
        /// <c>Message</c> equals <paramref name="message"/> to <see cref="MESSAGE"/>.
        /// </summary>
        /// <param name="dataBaseTitle">Table to query.</param>
        /// <param name="message">Candidate word.</param>
        public static void INQUIRE_TABLE(string dataBaseTitle, string message)
        {
            using (MySqlCommand cmd = BuildMessageLookup(dataBaseTitle, message))
            using (MySqlDataReader data = cmd.ExecuteReader())
            {
                while (data.Read())
                {
                    MESSAGE.Add(data[0].ToString());
                }
            }
        }

        /// <summary>Empties the shared <see cref="MESSAGE"/> buffer.</summary>
        public static void MYSQL_MESSAGE_CLERA()
        {
            MESSAGE.Clear();
        }

        // Builds "select * from <table> where Message=@message" with the word bound
        // as a parameter, so quotes inside the word can neither break nor alter the
        // statement.
        private static MySqlCommand BuildMessageLookup(string dataBaseTitle, string message)
        {
            RequireSafeTableName(dataBaseTitle);

            MySqlCommand cmd = new MySqlCommand(
                "select * from " + dataBaseTitle + " where Message=@message", mySqlConnection);
            // A null word is sent as '' to match what the concatenated query produced.
            cmd.Parameters.AddWithValue("@message", message ?? "");
            return cmd;
        }

        // Table names cannot be bound as parameters, so reject anything that is not
        // a plain (optionally qualified or back-quoted) identifier.
        private static void RequireSafeTableName(string dataBaseTitle)
        {
            if (string.IsNullOrEmpty(dataBaseTitle))
            {
                throw new ArgumentException("Table name must not be empty.", "dataBaseTitle");
            }

            foreach (char c in dataBaseTitle)
            {
                bool allowed = char.IsLetterOrDigit(c) || c == '_' || c == '$' || c == '.' || c == '`';
                if (!allowed)
                {
                    throw new ArgumentException("Invalid table name: " + dataBaseTitle, "dataBaseTitle");
                }
            }
        }
    }
}


------------斷字處理功能------------

using System.Collections.Generic;
using UnityEngine;
using System.Collections;
using System.Text.RegularExpressions;

/// <summary>
/// String helpers for the long-word-first segmenter: strips bracket markers,
/// cuts a message into fixed-size runs of non-whitespace characters, and
/// produces copies of a message with one such run removed.
/// </summary>
/// <remarks>
/// In every method taking <c>status</c>, a run is <c>status + 1</c> consecutive
/// non-whitespace characters (so <c>status == 0</c> means single characters).
/// </remarks>
public class SegmentationSymbol
{
    // Bracket characters used as segmentation markers.
    private readonly string[] specialSymbol = new string[] { "(", ")" };

    /// <summary>Replaces every bracket marker in the message with a space.</summary>
    /// <param name="userMessage">Text to clean.</param>
    /// <returns>The text with "(" and ")" replaced by " ".</returns>
    public string delectSymbol(string userMessage)
    {
        string cleaned = userMessage;

        foreach (string symbol in specialSymbol)
        {
            cleaned = cleaned.Replace(symbol, " ");
        }

        return cleaned;
    }

    // Regex for one run of (status + 1) non-whitespace characters. A negative
    // status yields the empty pattern, as the original "\S"-concatenation loop did.
    private static string BuildRunPattern(int status)
    {
        return status < 0 ? "" : @"\S{" + (status + 1) + "}";
    }

    /// <summary>
    /// Cuts the message into consecutive, non-overlapping runs of
    /// <c>status + 1</c> non-whitespace characters.
    /// </summary>
    /// <param name="message">Text to cut.</param>
    /// <param name="status">Run length minus one.</param>
    /// <returns>The regex matches, in order of appearance.</returns>
    public List<Match> brokenSatus(string message, int status)
    {
        List<Match> runs = new List<Match>();

        foreach (Match run in Regex.Matches(message, BuildRunPattern(status)))
        {
            runs.Add(run);
        }

        return runs;
    }

    /// <summary>Keeps the first <paramref name="status"/> characters of the message.</summary>
    /// <param name="message">Text to truncate.</param>
    /// <param name="status">Index from which everything is removed.</param>
    /// <returns>
    /// The truncated text; the message unchanged when <paramref name="status"/>
    /// is at or past its end.
    /// </returns>
    public string delectWord(string message, int status)
    {
        // Some runtimes throw for Remove(Length); nothing needs removing there.
        if (status >= message.Length)
        {
            return message;
        }

        return message.Remove(status);
    }

    /// <summary>
    /// Removes the first run of <c>status + 1</c> non-whitespace characters.
    /// </summary>
    /// <param name="message">Text to process.</param>
    /// <param name="status">Run length minus one.</param>
    /// <returns>
    /// A list holding the message without its first run, or an empty list when
    /// the message contains no such run.
    /// </returns>
    public List<string> delect_Word_Combination(string message, int status)
    {
        List<string> results = new List<string>();

        Match first = Regex.Match(message, BuildRunPattern(status));
        if (first.Success)
        {
            // Remove by position: Replace() would also delete later repeats of
            // the same characters and return text that is not part of the message.
            results.Add(message.Remove(first.Index, first.Length));
        }

        return results;
    }

    /// <summary>
    /// For every run of <c>status + 1</c> non-whitespace characters, produces
    /// the message with that one run removed.
    /// </summary>
    /// <param name="message">Text to process.</param>
    /// <param name="status">Run length minus one.</param>
    /// <returns>One string per run, in order of appearance.</returns>
    public List<string> delect_oneWord(string message, int status)
    {
        List<string> results = new List<string>();

        foreach (Match run in Regex.Matches(message, BuildRunPattern(status)))
        {
            // Positional removal, for the same reason as in delect_Word_Combination.
            results.Add(message.Remove(run.Index, run.Length));
        }

        return results;
    }
}


------------斷詞程式------------

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System.Text.RegularExpressions;
using System;

/// <summary>
/// Unity demo of long-word-first segmentation against a MySQL word table.
/// The "測試" button starts a forward pass, a reverse pass, a forward-correction
/// pass and two tagging passes as coroutines that advance every 0.5 s; the "詞位"
/// button joins their results into the public <c>fulfil*</c> strings.
/// </summary>
public class LongWordsFirst : MonoBehaviour
{
    // str: sentence to segment. logeWordForward / logeWordReverse /
    // combination_string: the text still unprocessed by the forward pass, the
    // reverse pass and the forward-correction pass respectively.
    public string str = "近年來知名夜市", logeWordForward, logeWordReverse, combination_string;

    private SegmentationSymbol segmentationSymbol = new SegmentationSymbol();

    // Current window size (minus one) of the forward pass.
    private int messageNumber2;

    // Words found by the forward pass (in discovery order, not sentence order).
    private List<string> longeWordForward_Message = new List<string>();

    // Forward words put back into sentence order (collected from the end backwards).
    private List<string> forward_exclufr = new List<string>();

    // Words found by the reverse pass (collected from the end backwards).
    private List<string> longeWordReverse_Message = new List<string>();

    // Lexical tags for forward_exclufr, index-aligned with it.
    private List<string> forwardLexical = new List<string>();
    // Lexical tags for longeWordReverse_Message, index-aligned with it.
    private List<string> reverseLexical = new List<string>();

    // Final "|"-separated results, filled by order().
    public string fulfilForward, fulfilReverse, fulfil_ForwardLexical, fulfil_ReverseLexical;

    // Word table and grammar table names.
    public static string dataBasetitle = "glossary", dataBasetitle2 = "grammar";

    void Start()
    {
        openDataBase();
    }

    // Opens the shared MySQL connection from the settings held by SQL.SQL.
    void openDataBase()
    {
        // The listing referenced an undefined "CMySql"; the helper class that
        // owns these settings in this file is SQL.SQL.
        string connectionString = string.Format("Server = {0}; Database = {1}; UserID = {2}; Password = {3}; Port = {4};", SQL.SQL.HOST_TYPE, SQL.SQL.DATABASE_TYPE, SQL.SQL.ID_TYPE, SQL.SQL.PASSWORD_TYPE, SQL.SQL.PORT_TYPE);
        SQL.SQL.openSqlConnection(connectionString);
    }

    void OnGUI()
    {
        if (GUILayout.Button("測試"))
        {
            cutting_();
        }

        if (GUILayout.Button("詞位"))
        {
            // Put the collected words and tags back into sentence order.
            StartCoroutine(order());
        }
    }

    void OnApplicationQuit()
    {
        SQL.SQL.closeSqlConnection();
    }

    // Starts a fresh segmentation run of str.
    private void cutting_()
    {
        // Passes from an earlier click may still be running; stop them so they
        // cannot keep mutating the state this run is about to reset.
        StopAllCoroutines();
        listClear();

        string input = str ?? "";
        logeWordForward = input;
        logeWordReverse = input;
        combination_string = input;

        messageNumber2 = logeWordForward.Length;

        StartCoroutine(loneWordForward(logeWordForward.Length));
        StartCoroutine(longWordReverse(0));

        // Lexical tagging of the words as they are found.
        StartCoroutine(forwardLable());
        StartCoroutine(reverseLable());

        // Re-ordering of the forward words.
        StartCoroutine(ForwardAnalysis(0));
    }

    // Waits until the forward-correction pass has consumed the whole sentence,
    // then publishes all four results in sentence order.
    // NOTE(review): only combination_string is awaited; the reverse pass and the
    // tagging passes may still be adding entries when this fires.
    IEnumerator order()
    {
        while (!string.IsNullOrEmpty(combination_string))
        {
            yield return new WaitForSeconds(0.5f);
        }

        // Assigned rather than appended, so pressing the button twice does not
        // duplicate the output.
        fulfilForward = joinReversed(forward_exclufr);
        fulfilReverse = joinReversed(longeWordReverse_Message);
        fulfil_ForwardLexical = joinReversed(forwardLexical);
        fulfil_ReverseLexical = joinReversed(reverseLexical);
    }

    // Concatenates the items from last to first, each followed by "|".
    private static string joinReversed(List<string> items)
    {
        string joined = "";

        for (int x = items.Count - 1; x >= 0; x--)
        {
            joined += items[x] + "|";
        }

        return joined;
    }

    // Tags the reverse-pass words.
    IEnumerator reverseLable()
    {
        return labelWords(longeWordReverse_Message, reverseLexical);
    }

    // Tags the corrected forward-pass words.
    IEnumerator forwardLable()
    {
        return labelWords(forward_exclufr, forwardLexical);
    }

    // Every 0.5 s looks up the lexical tag of the next not-yet-tagged word and
    // appends it to labels. Stops at the first lookup failure.
    private IEnumerator labelWords(List<string> words, List<string> labels)
    {
        int next = 0;

        while (true)
        {
            try
            {
                if (next < words.Count)
                {
                    labels.Add(SQL.SQL.INQUIRE_Lexical(dataBasetitle, words[next]));
                    next++;
                }
            }
            catch (Exception e)
            {
                // Still ends the pass as before, but no longer silently.
                Debug.LogException(e);
                break;
            }

            yield return new WaitForSeconds(0.5f);
        }
    }

    // Forward long-word-first pass: each tick cuts the remaining text into runs
    // of (messageNumber2 + 1) characters, keeps the runs found in the word table
    // and removes them from the remaining text, then shrinks the window by one.
    // (A leading loop that asked for runs one character longer than the text
    // itself could never match and was removed.)
    IEnumerator loneWordForward(int lenght)
    {
        while (logeWordForward.Length > 0)
        {
            foreach (Match run in segmentationSymbol.brokenSatus(logeWordForward, messageNumber2))
            {
                string runText = run.ToString();
                SQL.SQL.INQUIRE_TABLE(dataBasetitle, runText);

                foreach (string row in SQL.SQL.MESSAGE)
                {
                    if (row == runText)
                    {
                        longeWordForward_Message.Add(row);
                    }

                    // NOTE(review): as in the original, these two statements run
                    // for every returned row, not only for matching ones.
                    logeWordForward = inspection(logeWordForward, longeWordForward_Message);
                    messageNumber2 = logeWordForward.Length;
                }

                SQL.SQL.MYSQL_MESSAGE_CLERA();
            }

            // Stop at single characters; going below zero produced an empty
            // regex that matched at every position.
            if (messageNumber2 > 0)
            {
                messageNumber2 -= 1;
            }

            yield return new WaitForSeconds(0.5f);
        }
    }

    // Reverse long-word-first pass: each tick tests the whole remaining text and
    // the remaining text without its first (lenght + 1) characters against the
    // word table; a hit is recorded and cut off the end of the remaining text.
    // (A leading loop that could never match was removed, as in the forward pass.)
    // NOTE(review): after a hit lenght is reset to 0 and then incremented, so the
    // next tick starts at 1 — kept as in the original.
    IEnumerator longWordReverse(int lenght)
    {
        while (logeWordReverse.Length > 0)
        {
            // Rows for the whole remaining text stay in MESSAGE for the final check.
            SQL.SQL.INQUIRE_TABLE(dataBasetitle, logeWordReverse);

            foreach (string candidate in segmentationSymbol.delect_Word_Combination(logeWordReverse, lenght))
            {
                SQL.SQL.INQUIRE_TABLE(dataBasetitle, candidate);

                foreach (string sqlMessage in SQL.SQL.MESSAGE)
                {
                    if (sqlMessage == candidate)
                    {
                        longeWordReverse_Message.Add(sqlMessage);
                        logeWordReverse = segmentationSymbol.delectWord(logeWordReverse, logeWordReverse.Length - sqlMessage.Length);
                        lenght = 0;
                        // One hit per candidate: duplicate rows must not cut the
                        // remaining text a second time.
                        break;
                    }
                }
            }

            lenght += 1;

            foreach (string sqlMessage in SQL.SQL.MESSAGE)
            {
                if (sqlMessage == logeWordReverse)
                {
                    longeWordReverse_Message.Add(logeWordReverse);
                    logeWordReverse = segmentationSymbol.delectWord(logeWordReverse, logeWordReverse.Length - sqlMessage.Length);
                    break;
                }
            }

            // MESSAGE is shared with the forward pass; do not leave this tick's
            // rows behind for it (or for the next tick of this pass).
            SQL.SQL.MYSQL_MESSAGE_CLERA();

            // Wrap on this pass's own remaining text. combination_string is
            // shrunk by ForwardAnalysis and pinned lenght to 0 once it emptied.
            if (lenght >= logeWordReverse.Length)
            {
                lenght = 0;
            }

            yield return new WaitForSeconds(0.5f);
        }
    }

    // Forward correction: walks the sentence from the end and moves the words
    // found by the forward pass into forward_exclufr in reverse sentence order.
    IEnumerator ForwardAnalysis(int lenght)
    {
        while (combination_string.Length > 0)
        {
            foreach (string candidate in segmentationSymbol.delect_Word_Combination(combination_string, lenght))
            {
                Debug.Log(candidate);

                if (longeWordForward_Message.Contains(candidate))
                {
                    forward_exclufr.Add(candidate);
                    combination_string = segmentationSymbol.delectWord(combination_string, combination_string.Length - candidate.Length);
                    longeWordForward_Message.Remove(candidate);
                    lenght = 0;
                }
            }
            lenght += 1;

            if (longeWordForward_Message.Contains(combination_string))
            {
                // Remember the word before combination_string is rewritten; the
                // original removed the input sentence field "str" here instead.
                string whole = combination_string;

                forward_exclufr.Add(whole);
                combination_string = inspection(combination_string, forward_exclufr);
                longeWordForward_Message.Remove(whole);
                lenght = 0;
            }

            if (lenght >= combination_string.Length)
            {
                lenght = 0;
            }

            yield return new WaitForSeconds(0.5f);
        }
    }

    // Returns message with every occurrence of every listed word removed.
    private string inspection(string message, List<string> list)
    {
        string remaining = message;

        foreach (string word in list)
        {
            // string.Replace throws on an empty search string.
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            remaining = remaining.Replace(word, "");
        }

        return remaining;
    }

    // Resets every per-run collection.
    private void listClear()
    {
        longeWordForward_Message.Clear();
        longeWordReverse_Message.Clear();
        forward_exclufr.Clear();
        // Tags are index-aligned with the word lists, so they must be reset too.
        forwardLexical.Clear();
        reverseLexical.Clear();
    }
}



------------結果圖------------



沒有留言:

張貼留言