![[ Computer Vision (Read API) ] AI-OCRでFAX送信された帳票をCSV化してみました](https://devio2023-media.developers.io/wp-content/uploads/2023/07/icon.jpg)
[ Computer Vision (Read API) ] AI-OCRでFAX送信された帳票をCSV化してみました
1 はじめに
CX 事業本部 delivery部の平内(SIN)です。
一昔前まで、OCRによるテキスト化は、誤変換が多くて、なかなか実用が難しいというイメージがあったのですが、最近のAI-OCRは、日本語や手書きのものも結構な精度で読み取れるようになっています。 そして、モデルは、どんどん更新されているので、今後、ますます、精度は上がっていくでしょう。
2 歪みの修正
- グレースケール変換
- エッジ抽出
- 膨張処理
- 最大矩形検出
- 射影変換
# Step 1: grayscale conversion of the original BGR image
gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)

# Step 2: edge extraction (Canny with thresholds 1 and 100)
edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)

# Step 3: dilation with a 2x2 rectangular kernel
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
dilate_image = cv2.dilate(edges_image, kernel)

# Step 4: contour extraction, outermost contours only (RETR_EXTERNAL)
contours, hierarchy = cv2.findContours(
    dilate_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)

# Step 5: perspective transform; pts2 (detected corners) is mapped onto pts1.
# pts1, pts2, w2 and h2 are computed in the full listing, not in this excerpt.
M = cv2.getPerspectiveTransform(pts2, pts1)
img = cv2.warpPerspective(
    org_img, M, (w2 + 100, h2 + 100), borderValue=(255, 255, 255)
)
import os

import cv2
import numpy as np


class ImageTool:
    """Read and write PNG images located next to this script."""

    def __init__(self, base_name):
        # Images are resolved relative to the script, not the working directory.
        self.dir = os.path.dirname(os.path.abspath(__file__))
        self.base_name = base_name

    def read(self):
        """Load ``<dir>/<base_name>.png``.

        Raises:
            OSError: if OpenCV could not read the file (``cv2.imread``
                signals failure by returning ``None`` instead of raising).
        """
        path = "{}/{}.png".format(self.dir, self.base_name)
        img = cv2.imread(path)
        if img is None:
            raise OSError("could not read image: {}".format(path))
        return img

    def write(self, prefix, img):
        """Save ``img`` as ``<dir>/<base_name>_<prefix>.png``."""
        cv2.imwrite("{}/{}_{}.png".format(self.dir, self.base_name, prefix), img)


def main():
    """Correct the skew of ``fax.png`` and save every intermediate image."""
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()

    # Grayscale conversion
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)
    image_tool.write("glay", gray_image)

    # Edge extraction
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)
    image_tool.write("edges", edges_image)

    # Dilation
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    dilate_image = cv2.dilate(edges_image, kernel)
    image_tool.write("dilate", dilate_image)

    # Contour detection (outermost contours only)
    contours, _ = cv2.findContours(
        dilate_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    if not contours:
        raise RuntimeError("no contours found; cannot locate the form")

    # Largest contour by area; max() keeps the first one on ties.
    largest = max(contours, key=cv2.contourArea)
    max_rect = cv2.minAreaRect(largest)
    rect_point = cv2.boxPoints(max_rect).astype(int)
    rect_image = org_img.copy()
    cv2.drawContours(rect_image, [rect_point], 0, (0, 0, 255), 5)
    image_tool.write("rect", rect_image)

    # Perspective transform.
    # NOTE(review): the margin signs below assume a particular corner order
    # from cv2.boxPoints for this sample image - confirm for other inputs.
    ((x1, y1), (x2, y2), (x3, y3), (x4, y4)) = rect_point
    margin = 100
    x1 -= margin
    x2 -= margin
    x3 += margin
    x4 += margin
    y1 += margin * 2
    y2 -= margin
    y3 -= margin
    y4 += margin * 2
    pts2 = [(x2, y2), (x1, y1), (x4, y4), (x3, y3)]
    w2 = int(max(x for x, _ in pts2))
    h2 = int(max(y for _, y in pts2))
    h, w, _ = org_img.shape
    pts1 = np.float32([(0, 0), (0, h), (w, h), (w, 0)])
    pts2 = np.float32(pts2)

    M = cv2.getPerspectiveTransform(pts2, pts1)
    img = cv2.warpPerspective(
        org_img, M, (w2 + 100, h2 + 100), borderValue=(255, 255, 255)
    )
    image_tool.write("output", img)


if __name__ == "__main__":
    main()
3 帳票の検出
- グレースケール変換
- エッジ抽出
- 膨張処理
- 最大矩形検出
- 射影変換
グレースケール変換、エッジ抽出、膨張処理で得られた画像から、cv2.RETR_TREE で、すべての輪郭を抽出します。
contours, hierarchy = cv2.findContours( dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE )
# 面積でフィルタリング rects = [] for cnt, hrchy in zip(contours, hierarchy[0]): if cv2.contourArea(cnt) < 3000: continue # 面積が一定の大きさを満たさないものを除く if cv2.contourArea(cnt) > 20000: continue # 面積が一定の大きさを超えるものを除く if hrchy[3] == -1: continue # ルートノードは除く # 輪郭を囲む長方形を計算する。 rect = cv2.minAreaRect(cnt) rect_points = cv2.boxPoints(rect).astype(int) rects.append(rect_points)
import os

import cv2
import numpy as np

# Contours outside this area range (in pixels) are not treated as form cells.
MIN_CELL_AREA = 3000
MAX_CELL_AREA = 20000


class ImageTool:
    """Read and write PNG images located next to this script."""

    def __init__(self, base_name):
        self.dir = os.path.dirname(os.path.abspath(__file__))
        self.base_name = base_name

    def read(self):
        """Load ``<dir>/<base_name>.png``; raise OSError if it cannot be read."""
        path = "{}/{}.png".format(self.dir, self.base_name)
        img = cv2.imread(path)
        if img is None:  # cv2.imread returns None instead of raising
            raise OSError("could not read image: {}".format(path))
        return img

    def write(self, prefix, img):
        """Save ``img`` as ``<dir>/<base_name>_<prefix>.png``."""
        cv2.imwrite("{}/{}_{}.png".format(self.dir, self.base_name, prefix), img)


# Rectangle drawing
def disp_rects(rects, img, thickness):
    """Return a copy of ``img`` with every rectangle drawn in a random color."""
    image = img.copy()
    for i in range(len(rects)):
        color = np.random.randint(0, 255, 3).tolist()
        cv2.drawContours(image, rects, i, color, thickness)
    return image


def create_white_image(org_img):
    """Return an all-white uint8 image with the same shape as ``org_img``."""
    h, w, c = org_img.shape
    return np.full((h, w, c), 255, np.uint8)


def _find_cell_rects(org_img):
    """Return the 4-corner boxes of contours that look like form cells."""
    # Grayscale conversion -> edge extraction -> dilation
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilate_image = cv2.dilate(edges_image, kernel)

    # Contour extraction (full hierarchy)
    contours, hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )
    if hierarchy is None:  # nothing detected
        return []

    rects = []
    for cnt, hrchy in zip(contours, hierarchy[0]):
        if hrchy[3] == -1:
            continue  # skip root nodes
        area = cv2.contourArea(cnt)
        if area < MIN_CELL_AREA or area > MAX_CELL_AREA:
            continue  # skip contours that are too small or too large
        # Rectangle enclosing the contour.
        rect = cv2.minAreaRect(cnt)
        rects.append(cv2.boxPoints(rect).astype(int))
    return rects


def main():
    """Detect the cells of ``fax.png`` and save images with them outlined."""
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()
    white_img = create_white_image(org_img)

    rects = _find_cell_rects(org_img)

    thickness = 3
    image_tool.write("output1", disp_rects(rects, org_img, thickness))
    image_tool.write("output2", disp_rects(rects, white_img, thickness))


if __name__ == "__main__":
    main()
4 帳票の座標検出
def _cluster_sorted(values, gap):
    """Group ascending ``values`` into runs whose neighbours differ by <= gap."""
    clusters = []
    for val in values:
        if clusters and val <= clusters[-1][-1] + gap:
            clusters[-1].append(val)
        else:
            clusters.append([val])
    return clusters


# Consolidation of nearby coordinates
def consolidation(list):
    """Collapse runs of nearby coordinates into one representative each.

    Args:
        list: distinct coordinates in ascending order. (The parameter name
            shadows the builtin; it is kept so existing callers still work.)

    Returns:
        One int per accepted cluster: the midpoint between the cluster's
        smallest and largest value, truncated by ``int``. Values belong to
        the same cluster while each is within 3 pixels of the previous one.
        Clusters with 2 or fewer members are discarded as outliers.

    A coordinate of 0 is handled like any other value; no value is used as
    an "unset" marker.
    """
    return [
        int(cluster[0] + (cluster[-1] - cluster[0]) / 2)
        for cluster in _cluster_sorted(list, 3)
        if len(cluster) > 2  # discard isolated detections
    ]


# Coordinate detection
def detect_point(rects):
    """Return the consolidated (x_list, y_list) grid coordinates.

    Args:
        rects: iterable of rectangles, each a sequence of four (x, y) corners.

    Returns:
        Tuple ``(x_list, y_list)`` of ascending coordinates produced by
        ``consolidation`` from the distinct corner coordinates.
    """
    # Sets give distinct values without a linear scan per corner.
    xs = set()
    ys = set()
    for rect in rects:
        for x, y in rect[:4]:
            xs.add(x)
            ys.add(y)

    # Consolidate nearby values
    x_list = consolidation(sorted(xs))
    y_list = consolidation(sorted(ys))
    return x_list, y_list
import os

import cv2
import numpy as np

# Contours outside this area range (in pixels) are not treated as form cells.
MIN_CELL_AREA = 3000
MAX_CELL_AREA = 20000


class ImageTool:
    """Read and write PNG images located next to this script."""

    def __init__(self, base_name):
        self.dir = os.path.dirname(os.path.abspath(__file__))
        self.base_name = base_name

    def read(self):
        """Load ``<dir>/<base_name>.png``; raise OSError if it cannot be read."""
        path = "{}/{}.png".format(self.dir, self.base_name)
        img = cv2.imread(path)
        if img is None:  # cv2.imread returns None instead of raising
            raise OSError("could not read image: {}".format(path))
        return img

    def write(self, prefix, img):
        """Save ``img`` as ``<dir>/<base_name>_<prefix>.png``."""
        cv2.imwrite("{}/{}_{}.png".format(self.dir, self.base_name, prefix), img)


# Rectangle drawing
def disp_rects(rects, img, thickness):
    """Return a copy of ``img`` with every rectangle drawn in a random color."""
    image = img.copy()
    for i in range(len(rects)):
        color = np.random.randint(0, 255, 3).tolist()
        cv2.drawContours(image, rects, i, color, thickness)
    return image


def create_white_image(org_img):
    """Return an all-white uint8 image with the same shape as ``org_img``."""
    h, w, c = org_img.shape
    return np.full((h, w, c), 255, np.uint8)


def _cluster_sorted(values, gap):
    """Group ascending ``values`` into runs whose neighbours differ by <= gap."""
    clusters = []
    for val in values:
        if clusters and val <= clusters[-1][-1] + gap:
            clusters[-1].append(val)
        else:
            clusters.append([val])
    return clusters


# Consolidation of nearby coordinates
def consolidation(list):
    """Collapse runs of nearby ascending coordinates into cluster midpoints.

    Values within 3 pixels of the previous one share a cluster; clusters with
    2 or fewer members are discarded as outliers. The parameter name shadows
    the builtin and is kept only for backward compatibility.
    """
    return [
        int(cluster[0] + (cluster[-1] - cluster[0]) / 2)
        for cluster in _cluster_sorted(list, 3)
        if len(cluster) > 2
    ]


# Coordinate detection
def detect_point(rects):
    """Return consolidated ``(x_list, y_list)`` from the rectangles' corners."""
    xs = set()
    ys = set()
    for rect in rects:
        for x, y in rect[:4]:
            xs.add(x)
            ys.add(y)
    return consolidation(sorted(xs)), consolidation(sorted(ys))


# Line drawing
def disp_line(x_list, y_list, img):
    """Return a copy of ``img`` with the detected grid drawn in red.

    Vertical lines are drawn at every x between the smallest and largest y,
    horizontal lines at every y between the smallest and largest x. When
    either list is empty there is no grid, so an unmodified copy is returned.
    """
    image = img.copy()
    if not x_list or not y_list:
        return image
    x_min, x_max = min(x_list), max(x_list)
    y_min, y_max = min(y_list), max(y_list)
    segments = [((x, y_min), (x, y_max)) for x in x_list]
    segments += [((x_min, y), (x_max, y)) for y in y_list]
    for pt1, pt2 in segments:
        cv2.line(
            image,
            pt1=pt1,
            pt2=pt2,
            color=(0, 0, 255),
            thickness=1,
            lineType=cv2.LINE_4,
            shift=0,
        )
    return image


def _find_cell_rects(org_img):
    """Return the 4-corner boxes of contours that look like form cells."""
    # Grayscale conversion -> edge extraction -> dilation
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilate_image = cv2.dilate(edges_image, kernel)

    # Contour extraction (full hierarchy)
    contours, hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )
    if hierarchy is None:  # nothing detected
        return []

    rects = []
    for cnt, hrchy in zip(contours, hierarchy[0]):
        if hrchy[3] == -1:
            continue  # skip root nodes
        area = cv2.contourArea(cnt)
        if area < MIN_CELL_AREA or area > MAX_CELL_AREA:
            continue  # skip contours that are too small or too large
        rect = cv2.minAreaRect(cnt)
        rects.append(cv2.boxPoints(rect).astype(int))
    return rects


def main():
    """Detect the ruled-line grid of ``fax.png`` and save images showing it."""
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()
    white_img = create_white_image(org_img)

    # Coordinate detection
    x_list, y_list = detect_point(_find_cell_rects(org_img))

    image_tool.write("output1", disp_line(x_list, y_list, org_img))
    image_tool.write("output2", disp_line(x_list, y_list, white_img))


if __name__ == "__main__":
    main()
5 Computer Vision 3.2 Read API
AI-OCRとしては、MicrosoftのComputer Visionで提供されている、Read APIを使用させて頂きました。
最新のモデルは、2022-04-30となっており、日本語利用では、現時点で、このRead APIが、最も精度が高いかも知れません。
参考: Computer Vision 3.2 GA Read API を呼び出す
{ "status": "succeeded", "createdDateTime": "2023-07-02T06:47:56Z", "lastUpdatedDateTime": "2023-07-02T06:47:57Z", "analyzeResult": { "version": "3.2.0", "modelVersion": "2022-04-30", "readResults": [ { "page": 1, "angle": 0, "width": 2009, "height": 1218, "unit": "pixel", "lines": [ { "boundingBox": [ 113, 96, 246, 97, 246, 122, 113, 121 ], "text": "資材注文情報", "appearance": { "style": { "name": "other", "confidence": 0.972 } }, "words": [ { "boundingBox": [ 119, 96, 134, 97, 133, 122, 118, 122 ], "text": "資", "confidence": 0.989 }, { "boundingBox": [ 142, 97, 157, 97, 157, 122, 141, 122 ], "text": "材", "confidence": 0.965 }, ・・・略・・・
0: 資材注文情報 1: 注文ID 2: 建設会社コード 3: 建設会社名 4: 注文日時 5: 資材コード 6: 資材名 7: 数量 8: 単価 9: 合計金額 10: 注文ステータス 11: 1 12: CMP001 13: 建設A株式会社 14: 2023-06-30 12:30 15: MTL001 16: セメント 17: 100 18: 500 19: 50000 20: 処理中 ・・・略・・・
import json
import os
import time

import cv2
import requests


class ImageTool:
    """Read and write PNG images located next to this script."""

    def __init__(self, base_name):
        self.dir = os.path.dirname(os.path.abspath(__file__))
        self.base_name = base_name

    def file_name(self):
        """Return the full path of ``<base_name>.png``."""
        return "{}/{}.png".format(self.dir, self.base_name)

    def read(self):
        """Load the image; raise OSError if it cannot be read."""
        img = cv2.imread(self.file_name())
        if img is None:  # cv2.imread returns None instead of raising
            raise OSError("could not read image: {}".format(self.file_name()))
        return img

    def write(self, prefix, img):
        """Save ``img`` as ``<dir>/<base_name>_<prefix>.png``."""
        cv2.imwrite("{}/{}_{}.png".format(self.dir, self.base_name, prefix), img)


def readApi(imFilePath):
    """Run the Computer Vision 3.2 Read API on an image file.

    Submits the file, then polls the returned ``Operation-Location`` URL once
    per second until the response contains ``analyzeResult`` or reports
    ``status == "failed"``.

    Args:
        imFilePath: path of the image to analyze.

    Returns:
        The decoded JSON of the final poll response (dict). On failure it has
        no ``analyzeResult`` key, so callers must check for it.

    Raises:
        requests.HTTPError: if the submit or a poll request returns an error
            status.
    """
    with open(imFilePath, "rb") as f:
        data = f.read()

    # NOTE: placeholder key - supply a real one, and do not commit it.
    subscription_key = "xxxxxxxxxxxxxxxxxxxxxxxxx"
    endpoint = "https://japaneast.api.cognitive.microsoft.com/"
    model_version = "2022-04-30"
    language = "ja"

    text_recognition_url = endpoint + "vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Content-Type": "application/octet-stream",
    }
    # The key must be exactly "language"; with a trailing space the service
    # does not receive the language hint.
    params = {"language": language, "model-version": model_version}

    response = requests.post(
        text_recognition_url, headers=headers, params=params, data=data, timeout=30
    )
    response.raise_for_status()
    operation_url = response.headers["Operation-Location"]

    while True:
        response_final = requests.get(operation_url, headers=headers, timeout=30)
        response_final.raise_for_status()
        analysis = response_final.json()
        print(json.dumps(analysis, indent=4, ensure_ascii=False))
        if "analyzeResult" in analysis or analysis.get("status") == "failed":
            return analysis
        time.sleep(1)  # wait only when another poll is needed


def getXY(x_list, y_list, boundingBox):
    """Return the (column, row) of the grid cell containing ``boundingBox``.

    ``boundingBox`` is the Read API's 8-number list; indices 0/1 are used as
    the top-left corner and 4/5 as the bottom-right corner. A 1-pixel
    overshoot past the right/bottom ruled line is tolerated. Returns
    ``(-1, -1)`` when no single cell contains the box.
    """
    x1, y1 = boundingBox[0], boundingBox[1]
    x2, y2 = boundingBox[4], boundingBox[5]
    for row, (top, next_y) in enumerate(zip(y_list, y_list[1:])):
        if not (top <= y1 and y2 <= next_y + 1):
            continue
        for col, (left, next_x) in enumerate(zip(x_list, x_list[1:])):
            if left <= x1 and x2 <= next_x + 1:
                return col, row
    return -1, -1


def main():
    """OCR ``fax.png``, dump the JSON, and save an image with numbered boxes."""
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()

    # OCR through the Computer Vision 3.2 Read API
    response = readApi(image_tool.file_name())

    # Write the raw JSON response
    with open("output_read3.2.json", "w", encoding="utf-8") as fp:
        json.dump(response, fp, ensure_ascii=False, indent=2)

    if "analyzeResult" not in response:
        raise RuntimeError(
            "Read API analysis failed: status={}".format(response.get("status"))
        )

    output_img = org_img.copy()
    lines = response["analyzeResult"]["readResults"][0]["lines"]
    numbered = []
    for i, line in enumerate(lines):
        p = line["boundingBox"]
        # Rect as (x, y, width, height)
        cv2.rectangle(
            output_img, (p[0], p[1], p[4] - p[0], p[5] - p[1]), (0, 0, 255), 1
        )
        cv2.putText(
            output_img,
            str(i),
            (p[0] - 10, p[1]),
            cv2.FONT_HERSHEY_PLAIN,
            1,
            (0, 0, 255),
            1,
            cv2.LINE_AA,
        )
        numbered.append("{}: {}\n".format(i, line["text"]))
    print("".join(numbered))
    image_tool.write("output1", output_img)


if __name__ == "__main__":
    main()
6 CSV出力
def getXY(x_list, y_list, boundingBox):
    """Return the (column, row) of the grid cell that contains a bounding box.

    Args:
        x_list: ascending x coordinates of the vertical ruled lines.
        y_list: ascending y coordinates of the horizontal ruled lines.
        boundingBox: 8-number list from the Read API; indices 0/1 are used as
            the top-left corner and indices 4/5 as the bottom-right corner.

    Returns:
        ``(x, y)`` as zero-based column and row indices of the first cell
        (scanning rows, then columns) that fully contains the box. A 1-pixel
        overshoot past the right/bottom line is tolerated. ``(-1, -1)`` when
        no single cell contains the box.
    """
    x1, y1 = boundingBox[0], boundingBox[1]
    x2, y2 = boundingBox[4], boundingBox[5]

    for row, (top, next_y) in enumerate(zip(y_list, y_list[1:])):
        # The row test does not depend on the column, so check it once.
        if not (top <= y1 and y2 <= next_y + 1):
            continue
        for col, (left, next_x) in enumerate(zip(x_list, x_list[1:])):
            if left <= x1 and x2 <= next_x + 1:
                return col, row
    return -1, -1
注文ID,建設会社コード,建設会社名,注文日時,資材コード,資材名,数量,単価,合計金額,注文ステータス,, 1,CMP001,建設A株式会社,2023-06-30 12:30,MTL001,セメント,100,500,50000,処理中,, ,,建設B株式会社,2023-06-30 13:00,MTL002,鉄筋,50,20,10000,出荷済み,, 3,CMP003,建設C株式会社,2023-06-30 13:30,MTL003,砂利,200,300,60000,処理中,, 4,CMP004,建設D株式会社,2023-06-30 14:00,MTL004,コンクリートブロッ,25,800,20000,キャンセル,, ,,建設E株式会社,2023-06-30 15:00-,MTL005,木材,100,1000,100000,出荷済み,, 6,CMP001,建設A株式会社,2023-06-30 15:30,MTL006,塗料,30,2000,60000,処理中,, ,,建設B株式会社,2023-06-30 16:00,MTL007,ネジ,500,10,5000,出荷済み,, 8,CMP003,建設C株式会社,2023-06-30 16:30,MTL008,釘,1000,5,5000,処理中,, ・・・略・・・
import codecs
import csv
import json
import os
import time

import cv2
import numpy as np
import requests

# Contours outside this area range (in pixels) are not treated as form cells.
MIN_CELL_AREA = 3000
MAX_CELL_AREA = 20000


class ImageTool:
    """Read and write PNG images located next to this script."""

    def __init__(self, base_name):
        self.dir = os.path.dirname(os.path.abspath(__file__))
        self.base_name = base_name

    def read(self):
        """Load the image; raise OSError if it cannot be read."""
        img = cv2.imread(self.file_name())
        if img is None:  # cv2.imread returns None instead of raising
            raise OSError("could not read image: {}".format(self.file_name()))
        return img

    def write(self, prefix, img):
        """Save ``img`` as ``<dir>/<base_name>_<prefix>.png``."""
        cv2.imwrite("{}/{}_{}.png".format(self.dir, self.base_name, prefix), img)

    def file_name(self):
        """Return the full path of ``<base_name>.png``."""
        return "{}/{}.png".format(self.dir, self.base_name)


# Rectangle drawing
def disp_rects(rects, img, thickness):
    """Return a copy of ``img`` with every rectangle drawn in a random color."""
    image = img.copy()
    for i in range(len(rects)):
        color = np.random.randint(0, 255, 3).tolist()
        cv2.drawContours(image, rects, i, color, thickness)
    return image


def create_white_image(org_img):
    """Return an all-white uint8 image with the same shape as ``org_img``."""
    h, w, c = org_img.shape
    return np.full((h, w, c), 255, np.uint8)


def _cluster_sorted(values, gap):
    """Group ascending ``values`` into runs whose neighbours differ by <= gap."""
    clusters = []
    for val in values:
        if clusters and val <= clusters[-1][-1] + gap:
            clusters[-1].append(val)
        else:
            clusters.append([val])
    return clusters


# Consolidation of nearby coordinates
def consolidation(list):
    """Collapse runs of nearby ascending coordinates into cluster midpoints.

    Values within 3 pixels of the previous one share a cluster; clusters with
    2 or fewer members are discarded as outliers. The parameter name shadows
    the builtin and is kept only for backward compatibility.
    """
    return [
        int(cluster[0] + (cluster[-1] - cluster[0]) / 2)
        for cluster in _cluster_sorted(list, 3)
        if len(cluster) > 2
    ]


# Coordinate detection
def detect_point(rects):
    """Return consolidated ``(x_list, y_list)`` from the rectangles' corners."""
    xs = set()
    ys = set()
    for rect in rects:
        for x, y in rect[:4]:
            xs.add(x)
            ys.add(y)
    return consolidation(sorted(xs)), consolidation(sorted(ys))


# Line drawing
def disp_line(x_list, y_list, img):
    """Return a copy of ``img`` with the detected grid drawn in red.

    When either list is empty an unmodified copy is returned.
    """
    image = img.copy()
    if not x_list or not y_list:
        return image
    x_min, x_max = min(x_list), max(x_list)
    y_min, y_max = min(y_list), max(y_list)
    segments = [((x, y_min), (x, y_max)) for x in x_list]
    segments += [((x_min, y), (x_max, y)) for y in y_list]
    for pt1, pt2 in segments:
        cv2.line(
            image,
            pt1=pt1,
            pt2=pt2,
            color=(0, 0, 255),
            thickness=1,
            lineType=cv2.LINE_4,
            shift=0,
        )
    return image


def readApi(imFilePath):
    """Run the Computer Vision 3.2 Read API on an image file.

    Submits the file, then polls the returned ``Operation-Location`` URL once
    per second until the response contains ``analyzeResult`` or reports
    ``status == "failed"``.

    Returns:
        The decoded JSON of the final poll response (dict). On failure it has
        no ``analyzeResult`` key, so callers must check for it.

    Raises:
        requests.HTTPError: if the submit or a poll request returns an error
            status.
    """
    with open(imFilePath, "rb") as f:
        data = f.read()

    # NOTE: placeholder key - supply a real one, and do not commit it.
    subscription_key = "xxxxxxxxxxxxxxxxxxxxxxxxx"
    endpoint = "https://japaneast.api.cognitive.microsoft.com/"
    model_version = "2022-04-30"
    language = "ja"

    text_recognition_url = endpoint + "vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Content-Type": "application/octet-stream",
    }
    # The key must be exactly "language"; with a trailing space the service
    # does not receive the language hint.
    params = {"language": language, "model-version": model_version}

    response = requests.post(
        text_recognition_url, headers=headers, params=params, data=data, timeout=30
    )
    response.raise_for_status()
    operation_url = response.headers["Operation-Location"]

    while True:
        response_final = requests.get(operation_url, headers=headers, timeout=30)
        response_final.raise_for_status()
        analysis = response_final.json()
        print(json.dumps(analysis, indent=4, ensure_ascii=False))
        if "analyzeResult" in analysis or analysis.get("status") == "failed":
            return analysis
        time.sleep(1)  # wait only when another poll is needed


def getXY(x_list, y_list, boundingBox):
    """Return the (column, row) of the grid cell containing ``boundingBox``.

    Indices 0/1 of ``boundingBox`` are used as the top-left corner and 4/5 as
    the bottom-right corner. A 1-pixel overshoot past the right/bottom ruled
    line is tolerated. Returns ``(-1, -1)`` when no single cell contains it.
    """
    x1, y1 = boundingBox[0], boundingBox[1]
    x2, y2 = boundingBox[4], boundingBox[5]
    for row, (top, next_y) in enumerate(zip(y_list, y_list[1:])):
        if not (top <= y1 and y2 <= next_y + 1):
            continue
        for col, (left, next_x) in enumerate(zip(x_list, x_list[1:])):
            if left <= x1 and x2 <= next_x + 1:
                return col, row
    return -1, -1


def _find_cell_rects(org_img):
    """Return the 4-corner boxes of contours that look like form cells."""
    # Grayscale conversion -> edge extraction -> dilation
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilate_image = cv2.dilate(edges_image, kernel)

    # Contour extraction (full hierarchy)
    contours, hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )
    if hierarchy is None:  # nothing detected
        return []

    rects = []
    for cnt, hrchy in zip(contours, hierarchy[0]):
        if hrchy[3] == -1:
            continue  # skip root nodes
        area = cv2.contourArea(cnt)
        if area < MIN_CELL_AREA or area > MAX_CELL_AREA:
            continue  # skip contours that are too small or too large
        rect = cv2.minAreaRect(cnt)
        rects.append(cv2.boxPoints(rect).astype(int))
    return rects


def main():
    """OCR ``fax.png`` and write the recognized table to ``output.csv``."""
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()

    # Coordinate detection
    x_list, y_list = detect_point(_find_cell_rects(org_img))

    # OCR through the Computer Vision 3.2 Read API
    response = readApi(image_tool.file_name())

    # Write the raw JSON response
    with open("output_read3.2.json", "w", encoding="utf-8") as fp:
        json.dump(response, fp, ensure_ascii=False, indent=2)

    if "analyzeResult" not in response:
        raise RuntimeError(
            "Read API analysis failed: status={}".format(response.get("status"))
        )

    # Output buffer: one string per grid position (len(y_list) x len(x_list))
    table = [[""] * len(x_list) for _ in y_list]

    # Match each bounding box against the ruled-line grid to build the CSV
    for line in response["analyzeResult"]["readResults"][0]["lines"]:
        text = line["text"]
        boundingBox = line["boundingBox"]
        x, y = getXY(x_list, y_list, boundingBox)
        if x == -1:
            print(">> {} {}".format(text, boundingBox))
        else:
            print("[{},{}] {}".format(x, y, text))
            table[y][x] = text

    # CSV output. The extra empty field keeps the trailing comma of the
    # original format; csv.writer quotes text that contains commas or quotes.
    with open("output.csv", mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        for row in table:
            writer.writerow(row + [""])


if __name__ == "__main__":
    main()
7 修正
また、最初から帳票の外に配置されたテキストも存在します。 このように、いくつかの状況によっては、完全な自動化は難しく、手動での補正が必要となると思います。
(1) 資材注文情報 (2) 2 CMP002 (3) 5 CMP005 (4) 7 CMP002 (5) 2023-06-30 22:30 MTL020 (6) 前回と同様の注文です。
import csv
import json
import os
import time

import cv2
import numpy as np
import requests

# Contours outside this area range (in pixels) are not treated as form cells.
MIN_CELL_AREA = 3000
MAX_CELL_AREA = 20000


class ImageTool:
    """Read and write PNG images located next to this script."""

    def __init__(self, base_name):
        self.dir = os.path.dirname(os.path.abspath(__file__))
        self.base_name = base_name

    def read(self):
        """Load the image; raise OSError if it cannot be read."""
        img = cv2.imread(self.file_name())
        if img is None:  # cv2.imread returns None instead of raising
            raise OSError("could not read image: {}".format(self.file_name()))
        return img

    def write(self, prefix, img):
        """Save ``img`` as ``<dir>/<base_name>_<prefix>.png``."""
        cv2.imwrite("{}/{}_{}.png".format(self.dir, self.base_name, prefix), img)

    def file_name(self):
        """Return the full path of ``<base_name>.png``."""
        return "{}/{}.png".format(self.dir, self.base_name)


# Rectangle drawing
def disp_rects(rects, img, thickness):
    """Return a copy of ``img`` with every rectangle drawn in a random color."""
    image = img.copy()
    for i in range(len(rects)):
        color = np.random.randint(0, 255, 3).tolist()
        cv2.drawContours(image, rects, i, color, thickness)
    return image


def create_white_image(org_img):
    """Return an all-white uint8 image with the same shape as ``org_img``."""
    h, w, c = org_img.shape
    return np.full((h, w, c), 255, np.uint8)


def _cluster_sorted(values, gap):
    """Group ascending ``values`` into runs whose neighbours differ by <= gap."""
    clusters = []
    for val in values:
        if clusters and val <= clusters[-1][-1] + gap:
            clusters[-1].append(val)
        else:
            clusters.append([val])
    return clusters


# Consolidation of nearby coordinates
def consolidation(list):
    """Collapse runs of nearby ascending coordinates into cluster midpoints.

    Values within 3 pixels of the previous one share a cluster; clusters with
    2 or fewer members are discarded as outliers. The parameter name shadows
    the builtin and is kept only for backward compatibility.
    """
    return [
        int(cluster[0] + (cluster[-1] - cluster[0]) / 2)
        for cluster in _cluster_sorted(list, 3)
        if len(cluster) > 2
    ]


# Coordinate detection
def detect_point(rects):
    """Return consolidated ``(x_list, y_list)`` from the rectangles' corners."""
    xs = set()
    ys = set()
    for rect in rects:
        for x, y in rect[:4]:
            xs.add(x)
            ys.add(y)
    return consolidation(sorted(xs)), consolidation(sorted(ys))


# Line drawing
def disp_line(x_list, y_list, img):
    """Return a copy of ``img`` with the detected grid drawn in red.

    When either list is empty an unmodified copy is returned.
    """
    image = img.copy()
    if not x_list or not y_list:
        return image
    x_min, x_max = min(x_list), max(x_list)
    y_min, y_max = min(y_list), max(y_list)
    segments = [((x, y_min), (x, y_max)) for x in x_list]
    segments += [((x_min, y), (x_max, y)) for y in y_list]
    for pt1, pt2 in segments:
        cv2.line(
            image,
            pt1=pt1,
            pt2=pt2,
            color=(0, 0, 255),
            thickness=1,
            lineType=cv2.LINE_4,
            shift=0,
        )
    return image


def readApi(imFilePath):
    """Run the Computer Vision 3.2 Read API on an image file.

    Submits the file, then polls the returned ``Operation-Location`` URL once
    per second until the response contains ``analyzeResult`` or reports
    ``status == "failed"``.

    Returns:
        The decoded JSON of the final poll response (dict). On failure it has
        no ``analyzeResult`` key, so callers must check for it.

    Raises:
        requests.HTTPError: if the submit or a poll request returns an error
            status.
    """
    with open(imFilePath, "rb") as f:
        data = f.read()

    # NOTE: placeholder key - supply a real one, and do not commit it.
    subscription_key = "xxxxxxxxxxxxxxxxxxxxxxxxx"
    endpoint = "https://japaneast.api.cognitive.microsoft.com/"
    model_version = "2022-04-30"
    language = "ja"

    text_recognition_url = endpoint + "vision/v3.2/read/analyze"
    headers = {
        "Ocp-Apim-Subscription-Key": subscription_key,
        "Content-Type": "application/octet-stream",
    }
    # The key must be exactly "language"; with a trailing space the service
    # does not receive the language hint.
    params = {"language": language, "model-version": model_version}

    response = requests.post(
        text_recognition_url, headers=headers, params=params, data=data, timeout=30
    )
    response.raise_for_status()
    operation_url = response.headers["Operation-Location"]

    while True:
        response_final = requests.get(operation_url, headers=headers, timeout=30)
        response_final.raise_for_status()
        analysis = response_final.json()
        print(json.dumps(analysis, indent=4, ensure_ascii=False))
        if "analyzeResult" in analysis or analysis.get("status") == "failed":
            return analysis
        time.sleep(1)  # wait only when another poll is needed


def getXY(x_list, y_list, boundingBox):
    """Return the (column, row) of the grid cell containing ``boundingBox``.

    Indices 0/1 of ``boundingBox`` are used as the top-left corner and 4/5 as
    the bottom-right corner. A 1-pixel overshoot past the right/bottom ruled
    line is tolerated. Returns ``(-1, -1)`` when no single cell contains it.
    """
    x1, y1 = boundingBox[0], boundingBox[1]
    x2, y2 = boundingBox[4], boundingBox[5]
    for row, (top, next_y) in enumerate(zip(y_list, y_list[1:])):
        if not (top <= y1 and y2 <= next_y + 1):
            continue
        for col, (left, next_x) in enumerate(zip(x_list, x_list[1:])):
            if left <= x1 and x2 <= next_x + 1:
                return col, row
    return -1, -1


def _find_cell_rects(org_img):
    """Return the 4-corner boxes of contours that look like form cells."""
    # Grayscale conversion -> edge extraction -> dilation
    gray_image = cv2.cvtColor(org_img, cv2.COLOR_BGR2GRAY)
    edges_image = cv2.Canny(gray_image, 1, 100, apertureSize=3)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilate_image = cv2.dilate(edges_image, kernel)

    # Contour extraction (full hierarchy)
    contours, hierarchy = cv2.findContours(
        dilate_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )
    if hierarchy is None:  # nothing detected
        return []

    rects = []
    for cnt, hrchy in zip(contours, hierarchy[0]):
        if hrchy[3] == -1:
            continue  # skip root nodes
        area = cv2.contourArea(cnt)
        if area < MIN_CELL_AREA or area > MAX_CELL_AREA:
            continue  # skip contours that are too small or too large
        rect = cv2.minAreaRect(cnt)
        rects.append(cv2.boxPoints(rect).astype(int))
    return rects


def main():
    """OCR ``fax.png``, write ``output.csv``, and mark text outside the grid.

    Text whose bounding box fits a grid cell goes into the CSV. All other
    text is boxed and numbered on ``fax_output1.png`` and printed, so it can
    be corrected by hand.
    """
    base_name = "fax"
    image_tool = ImageTool(base_name)
    org_img = image_tool.read()

    # Coordinate detection
    x_list, y_list = detect_point(_find_cell_rects(org_img))

    # OCR through the Computer Vision 3.2 Read API
    response = readApi(image_tool.file_name())

    # Write the raw JSON response
    with open("output_read3.2.json", "w", encoding="utf-8") as fp:
        json.dump(response, fp, ensure_ascii=False, indent=2)

    if "analyzeResult" not in response:
        raise RuntimeError(
            "Read API analysis failed: status={}".format(response.get("status"))
        )

    # Output buffer: one string per grid position (len(y_list) x len(x_list))
    table = [[""] * len(x_list) for _ in y_list]

    outline_entries = []
    output_img = org_img.copy()

    # Match each bounding box against the ruled-line grid to build the CSV
    for line in response["analyzeResult"]["readResults"][0]["lines"]:
        text = line["text"]
        boundingBox = line["boundingBox"]
        x, y = getXY(x_list, y_list, boundingBox)
        if x != -1:
            table[y][x] = text
            continue

        # Text outside the grid: box it and label it with a running number.
        number = len(outline_entries) + 1
        cv2.rectangle(
            output_img,
            (
                boundingBox[0],
                boundingBox[1],
                boundingBox[4] - boundingBox[0],
                boundingBox[5] - boundingBox[1],
            ),
            (0, 0, 255),
            2,
        )
        cv2.putText(
            output_img,
            "({})".format(number),
            (boundingBox[0] - 50, boundingBox[1] + 20),
            cv2.FONT_HERSHEY_PLAIN,
            2,
            (0, 0, 255),
            2,
            cv2.LINE_AA,
        )
        outline_entries.append("({}) {}\n".format(number, text))

    # Output of the text found outside the form
    image_tool.write("output1", output_img)
    print("".join(outline_entries))

    # CSV output. The extra empty field keeps the trailing comma of the
    # original format; csv.writer quotes text that contains commas or quotes.
    with open("output.csv", mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        for row in table:
            writer.writerow(row + [""])


if __name__ == "__main__":
    main()
9 最後に