owned this note
owned this note
Published
Linked with GitHub
# 英文版chatbot
###### tags: `AI` `筆記`
# 英文版本
## 訓練方式
https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks
## 初次安裝
Dockerfile
```
# Development image for the English chatbot: Python 3.6 plus the BERT serving stack and Flask.
FROM ubuntu:18.04
# Interpreter and pip from the distribution packages.
RUN apt-get update && apt-get install -y python3.6 && apt-get install -y python3-pip
# TensorFlow is pinned to 1.12.0 (presumably for bert-serving compatibility; confirm before upgrading).
RUN pip3 install tensorflow==1.12.0
# Server and client halves of bert-as-service.
RUN pip3 install bert-serving-server
RUN pip3 install bert-serving-client
# Web framework for the API and SciPy for the cosine distance.
RUN pip3 install flask
RUN pip3 install scipy
```
```
docker build -t chatbot_en . --no-cache
```
```
docker run --name chatBotAPP_eng \
-e 'LC_ALL=C.UTF-8' \
-e 'LANG=C.UTF-8' \
-e 'FLASK_APP=app.py' \
-e 'FLASK_ENV=development' \
-it \
-v /Users/davidyang/code/chatBotApp_eng:/storage \
-d \
-p 5000:5000 \
chatbot_en
```
```
ln -s /usr/bin/python3.6 /usr/bin/python
PTHNAME="/storage/uncased_L-12_H-768_A-12"
```
# 啟動server
```
bert-serving-start -model_dir ${PTHNAME} -num_worker=1
```
# flask
```
import shlex
import subprocess

# Command line that starts the BERT sentence-encoding server as a child process.
Execute = "bert-serving-start -model_dir /storage/uncased_L-12_H-768_A-12 -num_worker=1"
# Run it as an argument list (no intermediate shell). The child's stdout is
# inherited instead of piped: nothing here ever reads the pipe, and an unread
# pipe can block the server once its buffer fills.
p = subprocess.Popen(shlex.split(Execute))
import json

from bert_serving.client import BertClient
from flask import Flask, escape, request,render_template, Response
from numpy import argmax
from scipy import spatial
# Flask application the routes below are registered on.
app = Flask(__name__)
# NOTE(review): SESSION_TYPE is a Flask-Session setting and that extension is
# not imported in this script, so this presumably has no effect; confirm.
app.config['SESSION_TYPE'] = 'filesystem'
# Client used to turn sentences into vectors via the BERT server on localhost.
bc = BertClient(ip='localhost')
# searchKey -> list of candidate sentences (written by /setQA, read by /getQA).
classes = {}
# searchKey -> encoded vectors for the same sentences.
classesEnc = {}
@app.route("/setQA", methods=['POST'])
def setQA():
    """Encode a list of candidate sentences and cache them under ``searchKey``.

    Form fields:
        Question: JSON array of sentences.
        searchKey: key the sentences and their vectors are stored under.

    Returns:
        JSON ``{"qaCount": ..., "searchKey": ...}`` with status 200, or a JSON
        ``{"error": ...}`` with status 400 when ``Question`` is not a
        non-empty JSON list.
    """
    searchKey = request.values.get('searchKey')
    try:
        classesTmp = json.loads(request.values.get('Question') or '')
    except ValueError:
        # Missing field or malformed JSON (JSONDecodeError is a ValueError).
        classesTmp = None
    if not isinstance(classesTmp, list) or not classesTmp:
        return app.response_class(
            response=json.dumps({"error": "Question must be a non-empty JSON list"}),
            status=400,
            mimetype='application/json'
        )
    # No OpenCC conversion here: this English script defines no `cc` converter.
    classesEncTmp = bc.encode(classesTmp)  # turn the sentences into numeric vectors
    classes[searchKey] = classesTmp
    classesEnc[searchKey] = classesEncTmp
    return app.response_class(
        response=json.dumps({"qaCount": len(classesTmp), "searchKey": searchKey}),
        status=200,
        mimetype='application/json'
    )
@app.route("/getQA",methods=["POST"])
def getQA():
    """Rank the sentences cached under ``searchKey`` against a query sentence.

    Form fields:
        searchKey: key previously used with /setQA.
        Question: the query sentence.

    Returns:
        JSON ``{"data": [{"Probability", "Question"}, ...], "searchKey": ...}``
        holding at most the five highest cosine similarities, or a JSON
        ``{"error": ...}`` with status 400 when the key is unknown or the
        question is missing.
    """
    searchKey = request.values.get('searchKey')
    q = request.values.get('Question')
    if q is None or searchKey not in classesEnc:
        return app.response_class(
            response=json.dumps({"error": "unknown searchKey or missing Question",
                                 "searchKey": searchKey}),
            status=400,
            mimetype='application/json'
        )
    # No OpenCC conversion here: this English script defines no `cc`/`ccback`.
    qbc = bc.encode([q])
    ans = predict_label(classesEnc[searchKey], qbc[0])
    anstable = sorted(zip(ans, classes[searchKey]), reverse=True)[:5]
    # float() keeps the payload serializable even if the score is a NumPy scalar.
    data = [{"Probability": float(p), "Question": s} for p, s in anstable]
    datadic = {"data": data, "searchKey": searchKey}
    return app.response_class(
        response=json.dumps(datadic),
        status=200,
        mimetype='application/json'
    )
@app.route("/getQAByData",methods=["POST"])
def getQAByData():
    """Rank an ad-hoc list of candidate sentences against a query sentence.

    Form fields:
        QuestionList: JSON array of candidate sentences.
        Question: the query sentence.

    Returns:
        JSON ``{"data": [{"Probability", "Question"}, ...]}`` holding at most
        the five highest cosine similarities, or a JSON ``{"error": ...}``
        with status 400 when the input is missing or malformed.
    """
    q = request.values.get('Question')
    try:
        classesTmp = json.loads(request.values.get('QuestionList') or '')
    except ValueError:
        # Missing field or malformed JSON (JSONDecodeError is a ValueError).
        classesTmp = None
    if q is None or not isinstance(classesTmp, list) or not classesTmp:
        return app.response_class(
            response=json.dumps({"error": "Question and a non-empty JSON QuestionList are required"}),
            status=400,
            mimetype='application/json'
        )
    # No OpenCC conversion here: this English script defines no `cc`/`ccback`.
    classesEncTmp = bc.encode(classesTmp)  # turn the sentences into numeric vectors
    qbc = bc.encode([q])
    ans = predict_label(classesEncTmp, qbc[0])
    anstable = sorted(zip(ans, classesTmp), reverse=True)[:5]
    # float() keeps the payload serializable even if the score is a NumPy scalar.
    data = [{"Probability": float(p), "Question": s} for p, s in anstable]
    datadic = {"data": data}
    return app.response_class(
        response=json.dumps(datadic),
        status=200,
        mimetype='application/json'
    )
def predict_label(classes_enc,v):
    """Return the cosine similarity between *v* and every vector in *classes_enc*.

    Args:
        classes_enc: iterable of 1-D vectors (the encoded candidate sentences).
        v: 1-D query vector.

    Returns:
        A list of floats, one per candidate, in the order of *classes_enc*.
    """
    return [cosine_sim(v, c) for c in classes_enc]


def cosine_sim(v1,v2):
    """Return the cosine similarity of two 1-D vectors as a builtin float.

    SciPy provides the cosine *distance*; similarity is one minus that. The
    result is coerced with float() so callers can pass it to json.dumps
    whatever NumPy scalar type SciPy returns.
    """
    return float(1 - spatial.distance.cosine(v1, v2))
```
# 執行
```
flask run --reload --debugger --host 0.0.0.0
```
# 修改
```
# Production image: dependencies first, application code last, so that editing
# the code does not invalidate the cached package layers.
FROM ubuntu:18.04
# Locale and Flask settings as image environment instead of exports inside CMD.
ENV LC_ALL=C.UTF-8 \
    LANG=C.UTF-8 \
    FLASK_APP=app.py
# Drop the apt package lists after installing to keep the layer small.
RUN apt-get update && apt-get install -y python3.6 python3-pip && rm -rf /var/lib/apt/lists/*
RUN pip3 install tensorflow==1.12.0
RUN pip3 install bert-serving-server
RUN pip3 install bert-serving-client
RUN pip3 install flask
RUN pip3 install scipy
WORKDIR /app
# COPY is enough for a plain directory copy; ADD's archive/URL handling is not needed.
COPY . /app
# Exec-form CMD so flask runs as the container's main process and receives signals directly.
CMD ["flask", "run", "--no-reload", "--no-debugger", "--host", "0.0.0.0"]
```
# 編譯
```
docker image build -t chotbot_en_v21 .
```
# 執行
```
docker run -d -p 6000:5000 --name mydocker123 raidavid/chotbot_en_v21
```
# 上傳
```
docker tag chotbot_en_v21 raidavid/chotbot_en_v21
docker push raidavid/chotbot_en_v21
```
# 使用
```
sudo docker run -d -p 6000:5000 --name mydocker1 --restart=always raidavid/chotbot_en_v21
```
# 停止並重新啟動
```
sudo docker stop mydocker1 && sudo docker rm mydocker1 && sudo docker run -d -p 6000:5000 --name mydocker1 --restart=always raidavid/chotbot_en_v21
```
# bella過渡版本
## 目標
- [ ] api判斷年齡性別(傳入臉部)
- [ ] 上傳候選句子組轉為向量 (setQA)
- [ ] 傳入文字返回排名
- [ ] 傳入候選句子+判斷句子返回最高可能性之句子
## 參考內容:https://gist.github.com/purelyvivid/567c78e72718b643fb59a4c7befd55fb
## start
```
cd /root/AgeGenderDetectAndBertServer
flask run --host=0.0.0.0
bert-serving-start -model_dir /root/AgeGenderDetectAndBertServer/bert/chinese_L-12_H-768_A-12/ -num_worker=1
```
## 中英句子判斷
下載 BERT 的中文模型:[BERT-Base, Chinese](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)
繁體簡體轉換所需套件:
```
pip install opencc-python-reimplemented
pip install numpy
```
啟動server
```
bert-serving-start -model_dir /root/AgeGenderDetectAndBertServer/bert/chinese_L-12_H-768_A-12/ -num_worker=1
```
測試
```
from flask import Flask, escape, request,render_template, Response
from opencc import OpenCC
from bert_serving.client import BertClient
from scipy import spatial
from numpy import argmax
# bert-serving-start -model_dir /Users/davidyang/work/code/AgeGenderDetect/bert/chinese_L-12_H-768_A-12/ -num_worker=1
# Text unrelated to this script had been pasted into the middle of the
# OpenCC('t2s') call, which made the script unparseable; the call is restored.
cc = OpenCC('t2s')  # Traditional -> Simplified Chinese
# Client for the BERT server on localhost that encodes the sentences below.
bc = BertClient(ip='localhost')
def cosine_sim(v1,v2):
    """Return the cosine similarity of two vectors (one minus SciPy's cosine distance)."""
    distance = spatial.distance.cosine(v1, v2)
    return 1 - distance
# Reference sentence for each class; a label is an index into this list.
classes = ["我想請假","如何借會議室","我的電腦壞了"]
# Evaluation sentences paired with the index of the class they should match.
sents_n_labels = [
("要如何請假", 0),
("有哪些假可以請", 0),
("哪間會議室是空的",1) ,
("會議室哪一間現在可以用",1),
("電腦打不開",2),
("筆電速度很慢",2),
]
# Run both the evaluation sentences and the class sentences through the
# Traditional -> Simplified converter before encoding.
sents = [cc.convert(t[0]) for t in sents_n_labels ]
labels = [t[1] for t in sents_n_labels ]
classes = [cc.convert(s) for s in classes ]
classes_enc = bc.encode(classes)  # convert the class sentences into numeric vectors
def predict_label(v):
    """Return the index of the vector in ``classes_enc`` most similar to *v*."""
    similarities = [cosine_sim(v, class_vec) for class_vec in classes_enc]
    return argmax(similarities)
def predict_labels(vs):
    """Return the predicted class index for every vector in *vs*, in order."""
    return [predict_label(vec) for vec in vs]
# Print the expected class indices next to the ones predicted from the
# encoded evaluation sentences.
print("True Label:", labels)
print("Predict Label:", predict_labels(bc.encode(sents)) )
```
```
from flask import Flask, escape, request,render_template, Response, redirect, url_for,jsonify
from flask_admin import Admin, BaseView, expose,AdminIndexView
from opencc import OpenCC
from bert_serving.client import BertClient
from scipy import spatial
from numpy import argmax
import json
import os
import time
from werkzeug.utils import secure_filename
from openvinoModel.openvino import PeopleData
from strUtil import Pic_str
import numpy as np
from RaiDlib import RaiDlib
from flask_admin.contrib.fileadmin import FileAdmin
import datetime
import random
import cv2
# Command used to start the BERT server that `bc` below connects to:
# bert-serving-start -model_dir /Users/davidyang/work/code/AgeGenderDetect/bert/chinese_L-12_H-768_A-12/ -num_worker=1
openvinoModel = PeopleData()
app = Flask(__name__)
app.config["DEBUG"] = True
UPLOAD_FOLDER = './temp'
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg'}
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB
# Take the secret from the environment when provided; the literal fallback
# keeps existing deployments working but should not be relied on.
app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'super secret key')
app.config['SESSION_TYPE'] = 'filesystem'
# File browser over the photo directory, served through Flask-Admin.
# NOTE(review): no access control is configured here; confirm this is intended.
admin = Admin()
admin.add_view(FileAdmin(os.path.join(os.path.dirname(__file__), '.', 'RaiAITool/photos'), name='RaiAiFile'))
admin.init_app(app)
cc = OpenCC('t2s')  # Traditional -> Simplified Chinese
ccback = OpenCC('s2t')  # Simplified -> Traditional Chinese
bc = BertClient(ip='localhost')
# searchKey -> list of candidate sentences (written by /setQA, read by /getQA).
classes = {}
# searchKey -> encoded vectors for the same sentences.
classesEnc = {}
raiDlib = RaiDlib()
@app.route('/', methods=['GET'])
def home():
    """Return a fixed HTML greeting for GET /."""
    greeting = "<h1>Hello Flask!</h1>"
    return greeting
@app.route("/setQA", methods=['POST'])
def setQA():
    """Encode a list of candidate sentences and cache them under ``searchKey``.

    Form fields:
        Question: JSON array of sentences.
        searchKey: key the sentences and their vectors are stored under.

    Returns:
        JSON ``{"qaCount": ..., "searchKey": ...}`` with status 200, or a JSON
        ``{"error": ...}`` with status 400 when ``Question`` is not a
        non-empty JSON list.
    """
    searchKey = request.values.get('searchKey')
    try:
        classesTmp = json.loads(request.values.get('Question') or '')
    except ValueError:
        # Missing field or malformed JSON (JSONDecodeError is a ValueError).
        classesTmp = None
    if not isinstance(classesTmp, list) or not classesTmp:
        return app.response_class(
            response=json.dumps({"error": "Question must be a non-empty JSON list"}),
            status=400,
            mimetype='application/json'
        )
    # Traditional -> Simplified before encoding.
    classesTmp = [cc.convert(s) for s in classesTmp]
    classesEncTmp = bc.encode(classesTmp)  # turn the sentences into numeric vectors
    classes[searchKey] = classesTmp
    classesEnc[searchKey] = classesEncTmp
    return app.response_class(
        response=json.dumps({"qaCount": len(classesTmp), "searchKey": searchKey}),
        status=200,
        mimetype='application/json'
    )
@app.route("/getQA",methods=["POST"])
def getQA():
    """Rank the sentences cached under ``searchKey`` against a query sentence.

    Form fields:
        searchKey: key previously used with /setQA.
        Question: the query sentence.

    Returns:
        JSON ``{"data": [{"Probability", "Question"}, ...], "searchKey": ...}``
        holding at most the five highest cosine similarities, with sentences
        converted back to Traditional Chinese; or a JSON ``{"error": ...}``
        with status 400 when the key is unknown or the question is missing.
    """
    searchKey = request.values.get('searchKey')
    question = request.values.get('Question')
    if question is None or searchKey not in classesEnc:
        return app.response_class(
            response=json.dumps({"error": "unknown searchKey or missing Question",
                                 "searchKey": searchKey}),
            status=400,
            mimetype='application/json'
        )
    q = cc.convert(question)  # Traditional -> Simplified, matching the cached sentences
    qbc = bc.encode([q])
    ans = predict_label(classesEnc[searchKey], qbc[0])
    anstable = sorted(zip(ans, classes[searchKey]), reverse=True)[:5]
    # float() keeps the payload serializable even if the score is a NumPy scalar.
    data = [{"Probability": float(p), "Question": ccback.convert(s)} for p, s in anstable]
    datadic = {"data": data, "searchKey": searchKey}
    return app.response_class(
        response=json.dumps(datadic),
        status=200,
        mimetype='application/json'
    )
@app.route("/getQAByData",methods=["POST"])
def getQAByData():
    """Rank an ad-hoc list of candidate sentences against a query sentence.

    Form fields:
        QuestionList: JSON array of candidate sentences.
        Question: the query sentence.

    Returns:
        JSON ``{"data": [{"Probability", "Question"}, ...]}`` holding at most
        the five highest cosine similarities, with sentences converted back
        to Traditional Chinese; or a JSON ``{"error": ...}`` with status 400
        when the input is missing or malformed.
    """
    question = request.values.get('Question')
    try:
        classesTmp = json.loads(request.values.get('QuestionList') or '')
    except ValueError:
        # Missing field or malformed JSON (JSONDecodeError is a ValueError).
        classesTmp = None
    if question is None or not isinstance(classesTmp, list) or not classesTmp:
        return app.response_class(
            response=json.dumps({"error": "Question and a non-empty JSON QuestionList are required"}),
            status=400,
            mimetype='application/json'
        )
    # Traditional -> Simplified before encoding.
    classesTmp = [cc.convert(s) for s in classesTmp]
    classesEncTmp = bc.encode(classesTmp)  # turn the sentences into numeric vectors
    qbc = bc.encode([cc.convert(question)])
    ans = predict_label(classesEncTmp, qbc[0])
    anstable = sorted(zip(ans, classesTmp), reverse=True)[:5]
    # float() keeps the payload serializable even if the score is a NumPy scalar.
    data = [{"Probability": float(p), "Question": ccback.convert(s)} for p, s in anstable]
    datadic = {"data": data}
    return app.response_class(
        response=json.dumps(datadic),
        status=200,
        mimetype='application/json'
    )
def allowed_file(filename, allowed_extensions=None):
    """Return True when *filename* ends in an allowed extension.

    The extension is compared case-insensitively, so ``photo.JPG`` is accepted
    when ``jpg`` is allowed. An empty or missing file name is rejected.

    Args:
        filename: the client-supplied file name.
        allowed_extensions: lowercase extensions without the dot; defaults to
            the module-level ``ALLOWED_EXTENSIONS``.
    """
    if not filename:
        return False
    if allowed_extensions is None:
        allowed_extensions = ALLOWED_EXTENSIONS
    _, dot, ext = filename.rpartition('.')
    return bool(dot) and ext.lower() in allowed_extensions
@app.route('/faceDetect', methods=['POST'])
def upload_file():
    """Save an uploaded face image and return the detected age and gender.

    Form fields:
        searchKey: prefix for the stored file name (optional).
        file: the uploaded image; its extension must be allowed.

    Returns:
        JSON ``{"data": {"age", "gender"}}`` on success, or
        ``{"error": "file name error"}`` when the file name or searchKey is
        not acceptable. Both use status 200, as before.
    """
    datadic = {}
    if request.method == 'POST':
        # A missing searchKey used to raise TypeError on concatenation.
        searchKey = request.values.get('searchKey') or ''
        file = request.files['file']
        # searchKey becomes part of the saved path, so refuse anything that
        # carries a directory component (e.g. "../../x").
        key_is_safe = os.path.basename(searchKey) == searchKey
        if file and key_is_safe and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            # The upload directory may not exist yet on a fresh deployment.
            os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
            path = os.path.join(app.config['UPLOAD_FOLDER'],
                                searchKey + "_" + str(int(time.time()*1000)) + "_" + filename)
            file.save(path)
            age, gender = openvinoModel.AgeGenderDetectByPath(path)
            datadic["data"] = {"age": age, "gender": gender}
        else:
            datadic["error"] = "file name error"
        return app.response_class(
            response=json.dumps(datadic),
            status=200,
            mimetype='application/json'
        )
    # Only reached for non-POST requests, which the route does not currently accept.
    return '''
<!doctype html>
<title>Upload new File</title>
<h1>Upload new File</h1>
<form action="" method=post enctype=multipart/form-data>
<p><input type=file name=file>
<input type=submit value=Upload>
</form>
'''
def predict_label(classes_enc,v):
    """Return the cosine similarity between *v* and every vector in *classes_enc*.

    Args:
        classes_enc: iterable of 1-D vectors (the encoded candidate sentences).
        v: 1-D query vector.

    Returns:
        A list of floats, one per candidate, in the order of *classes_enc*.
    """
    return [cosine_sim(v, c) for c in classes_enc]


def cosine_sim(v1,v2):
    """Return the cosine similarity of two 1-D vectors as a builtin float.

    SciPy provides the cosine *distance*; similarity is one minus that. The
    result is coerced with float() so callers can pass it to json.dumps
    whatever NumPy scalar type SciPy returns.
    """
    return float(1 - spatial.distance.cosine(v1, v2))
# Sample (sentence, class index) pairs. In this script they are only used by
# the commented-out evaluation code below.
sents_n_labels = [
("要如何請假", 0),
("有哪些假可以請", 0),
("哪間會議室是空的",1) ,
("會議室哪一間現在可以用",1),
("電腦打不開",2),
("筆電速度很慢",2),
]
# sents = [cc.convert(t[0]) for t in sents_n_labels ]
# labels = [t[1] for t in sents_n_labels ]
# classes = [cc.convert(s) for s in classes ]
# classes_enc = bc.encode(classes)  # convert the sentences into numeric vectors
@app.route('/checkFace', methods=['POST'])
def checkFace():
    """Decode the uploaded ``photo`` in memory and run face checking on it.

    Returns:
        JSON ``{"success", "msg", "data"}`` from ``raiDlib.check_Face``, or
        ``{"error": 1001, "msg": ...}`` when the upload is empty or cannot be
        decoded as an image.
    """
    filestr = request.files['photo'].read()
    if not filestr:
        return jsonify({"error": 1001, "msg": "上傳失敗"})
    # np.fromstring is deprecated for binary data; frombuffer is its replacement.
    npimg = np.frombuffer(filestr, np.uint8)
    frame = cv2.imdecode(npimg, cv2.IMREAD_COLOR)
    if frame is None:
        # imdecode signals an undecodable image with None instead of raising.
        return jsonify({"error": 1001, "msg": "上傳失敗"})
    statusCode, data, _ = raiDlib.check_Face(frame)
    return jsonify({"success": statusCode, "msg": "偵測完畢", "data": data})
@app.route('/checkFaceMax', methods=['POST'])
def checkFaceMax():
    """Decode the uploaded ``photo``, run ``check_Face_Max`` and archive the frame.

    The decoded frame is written under ``RaiAITool/photos/checkface`` with the
    timestamp and the detection result in its file name.

    Returns:
        JSON ``{"success", "msg", "data"}`` from ``raiDlib.check_Face_Max``, or
        ``{"error": 1001, "msg": ...}`` when the upload is empty or cannot be
        decoded as an image.
    """
    pathstr = "RaiAITool/photos/checkface"
    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs(pathstr, exist_ok=True)
    filestr = request.files['photo'].read()
    if not filestr:
        return jsonify({"error": 1001, "msg": "上傳失敗"})
    # np.fromstring is deprecated for binary data; frombuffer is its replacement.
    npimg = np.frombuffer(filestr, np.uint8)
    frame = cv2.imdecode(npimg, cv2.IMREAD_COLOR)
    if frame is None:
        # imdecode signals an undecodable image with None instead of raising.
        return jsonify({"error": 1001, "msg": "上傳失敗"})
    statusCode, data, _ = raiDlib.check_Face_Max(frame)
    path = os.path.join(pathstr, "_" + str(int(time.time()*1000)) + '(' + str(data) + ')' + ".jpg")
    cv2.imwrite(path, frame)
    return jsonify({"success": statusCode, "msg": "偵測完畢", "data": data})
@app.route('/up_photo', methods=['POST'])
def up_photo():
    """Store an uploaded face photo for ``name`` and register it with raiDlib.

    Form fields:
        photo: the image file; its extension must be allowed.
        name: label for the person; used as the file-name prefix.

    Returns:
        JSON ``{"success", "msg"}`` from ``raiDlib.up_photo_face``, or
        ``{"error": 1001, "msg": ...}`` when the upload is rejected.
    """
    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs(raiDlib.file_dir, exist_ok=True)
    f = request.files['photo']
    name = request.values['name']
    # name becomes part of the saved path, so refuse directory components.
    name_is_safe = os.path.basename(name) == name
    if not (f and name_is_safe and allowed_file(f.filename)):
        return jsonify({"error": 1001, "msg": "上傳失敗"})
    # Use the extension allowed_file() just validated. secure_filename() drops
    # non-ASCII characters (and then the leading dot), so splitting its result
    # raised IndexError for names such as "照片.jpg".
    ext = f.filename.rsplit('.', 1)[1]
    new_filename = name + '_' + Pic_str().create_uuid() + '.' + ext
    saved_path = os.path.join(raiDlib.file_dir, new_filename)
    f.save(saved_path)
    frame = cv2.imread(saved_path)
    if frame is None:
        # imread returns None when the file is not a readable image.
        return jsonify({"error": 1001, "msg": "上傳失敗"})
    statusCode, data = raiDlib.up_photo_face(name, new_filename, frame)
    return jsonify({"success": statusCode, "msg": data})
@app.route('/up_photo_max', methods=['POST'])
def up_photo_max():
    """Store an uploaded face photo for ``name`` and register it via ``up_photo_face_Max``.

    Form fields:
        photo: the image file; its extension must be allowed.
        name: label for the person; used as the file-name prefix.

    Returns:
        JSON ``{"success", "msg"}`` from ``raiDlib.up_photo_face_Max``, or
        ``{"error": 1001, "msg": ...}`` when the upload is rejected.
    """
    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs(raiDlib.file_dir, exist_ok=True)
    f = request.files['photo']
    name = request.values['name']
    # name becomes part of the saved path, so refuse directory components.
    name_is_safe = os.path.basename(name) == name
    if not (f and name_is_safe and allowed_file(f.filename)):
        return jsonify({"error": 1001, "msg": "上傳失敗"})
    # Use the extension allowed_file() just validated. secure_filename() drops
    # non-ASCII characters (and then the leading dot), so splitting its result
    # raised IndexError for names such as "照片.jpg".
    ext = f.filename.rsplit('.', 1)[1]
    new_filename = name + '_MAX_' + Pic_str().create_uuid() + '.' + ext
    saved_path = os.path.join(raiDlib.file_dir, new_filename)
    f.save(saved_path)
    frame = cv2.imread(saved_path)
    if frame is None:
        # imread returns None when the file is not a readable image.
        return jsonify({"error": 1001, "msg": "上傳失敗"})
    statusCode, data = raiDlib.up_photo_face_Max(name, new_filename, frame)
    return jsonify({"success": statusCode, "msg": data})
def create_uuid(self):
    """Return a name token: current time (YYYYmmddHHMMSS) plus a random suffix.

    The suffix is a random integer in 0..100, zero-padded to at least two
    digits. The token reduces the chance of file-name clashes but is not
    guaranteed unique: two calls within the same second can draw the same
    suffix.

    Args:
        self: unused; kept so existing call sites keep working.
    """
    nowTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    randomNum = random.randint(0, 100)
    # zfill pads only single digits; the previous `<= 10` test also padded 10,
    # producing "010".
    return nowTime + str(randomNum).zfill(2)
def predict_labels(vs, classes_enc=None):
    """Return the index of the best-matching class vector for each vector in *vs*.

    Args:
        vs: iterable of 1-D query vectors.
        classes_enc: encoded class vectors to compare against. Required: this
            script's ``predict_label`` takes the class vectors explicitly, so
            the previous one-argument call raised TypeError.

    Raises:
        ValueError: if *classes_enc* is not supplied.
    """
    if classes_enc is None:
        raise ValueError("classes_enc is required to predict labels")
    return [argmax(predict_label(classes_enc, v)) for v in vs]
# print("True Label:", labels)
# print("Predict Label:", predict_labels(bc.encode(sents)) )
if __name__ == "__main__":
    # The guard had no body, which is a SyntaxError. Starting the development
    # server here is presumably what was intended (the note otherwise uses
    # `flask run --host=0.0.0.0`); confirm host and port.
    app.run(host="0.0.0.0")
```
# 資料蒐集
## 使用Bert
Bert训练MRPC数据集,将预测模型写成API调用,以及简单html界面使用,后台服务使用flask
https://blog.csdn.net/weixin_37735081/article/details/94222230
本文基于Google开源的BERT代码进行了进一步的简化,方便生成句向量与做文本分类
https://github.com/terrifyzhao/bert-utils
https://blog.csdn.net/u012526436/article/details/84637834
http://www.iequa.com/2019/04/08/nlp/BERT/
https://leemeng.tw/attack_on_bert_transfer_learning_in_nlp.html
BERT中文實戰(文本相似度)
https://www.twblogs.net/a/5c0a6535bd9eee6fb37ba12a