标签:encoding app 处理 dir os.path ima objects ict span
JSON 解析练习、Python 图像处理练习；表格包围框的毛刺问题尚待解决。
# -*- coding: utf-8 -*-
"""Paint over annotated table ruling lines in page images.

The script reads an annotation CSV (column 0: image path, column 2: a JSON
document with an ``objects`` list, each object carrying
``polygon.ptList`` points with ``x``/``y`` keys).  Every polygon thinner than
20 px in one direction is treated as a ruling line and is overwritten with
the most frequent colour sampled 8 px beside it.  Results are written as JPEG
files into ``output_dir``.
"""
import csv
import json
import os
from collections import Counter

import numpy as np

# Palette kept from the original script; nothing below references it.
color = {
    0: [255, 0, 0],
    1: [0, 255, 0],
    2: [0, 0, 255],
}

page_img_dir = "JPG"
output_dir = "年报_PDF_TABLE_JPG_eliminate_lines-5-15-final"
annotation_csv = '表格结构标注-带线年报_UTF-8.csv'

_MAX_THICKNESS = 20    # a box thinner than this along one axis is a line
_END_MARGIN = 20       # a line is extended this far past its far end
_SNAP_DISTANCE = 10    # a crossing this close to an end replaces that end
_CROSS_TOLERANCE = 5   # slack when testing whether two lines intersect
_SAMPLE_OFFSET = 8     # distance from the line at which background is sampled
_FILL_MARGIN = 4       # extra pixels painted on each side of the line
_VERTICAL_TAIL = 5     # additional extension applied to vertical lines only


def _last_most_common(items):
    """Return the most frequent item, or None when *items* is empty.

    On a tie the item whose first occurrence came latest wins, which is the
    result the original ``>=`` scan over an insertion-ordered dict produced.
    """
    winner, best = None, 0
    for item, count in Counter(items).items():
        if count >= best:
            winner, best = item, count
    return winner


def publicnum(num, d=0):
    """Return ``str()`` of the most frequent element of *num*.

    Args:
        num: iterable of samples (pixels in this script).
        d: unused; kept so existing calls keep working.

    Returns:
        The string form of the modal element, or ``'[255 255 255]'`` when
        *num* is empty.
    """
    winner = _last_most_common(str(value) for value in num)
    return '[255 255 255]' if winner is None else winner


def _dominant_pixel(samples):
    """Return the modal pixel of *samples* as a tuple of channel values.

    Works on the pixel values themselves instead of parsing ``str(ndarray)``,
    so single-channel images are handled too.  With no samples the fill is
    ``(255,)``, which broadcasts to any channel count.
    """
    winner = _last_most_common(
        tuple(np.atleast_1d(pixel).tolist()) for pixel in samples
    )
    return (255,) if winner is None else winner


def _bounding_box(page_object):
    """Return ``(x_min, x_max, y_min, y_max)`` of an annotation polygon."""
    points = page_object['polygon']['ptList']
    xs = [point['x'] for point in points]
    ys = [point['y'] for point in points]
    return min(xs), max(xs), min(ys), max(ys)


def _segment_bounds(start, end, crossings):
    """Return the sorted, de-duplicated cut positions along a line.

    The line initially spans ``start`` .. ``end + 20``.  Each crossing adds a
    cut; a crossing within 10 px of an end (but not exactly on it) replaces
    that end, so the line stops at the crossing rather than overshooting it.
    """
    bounds = [end + _END_MARGIN, start]
    for point in crossings:
        bounds.append(point)
        if 0 < abs(end - point) < _SNAP_DISTANCE:
            stale = max(end + _END_MARGIN, point)
            kept = min(end + _END_MARGIN, point)
        elif 0 < abs(start - point) < _SNAP_DISTANCE:
            stale = min(start, point)
            kept = max(start, point)
        else:
            continue
        try:
            bounds.remove(stale)
        except ValueError:
            # That end was already replaced by an earlier crossing.
            continue
        bounds.append(kept)
    return sorted(set(bounds))


def erase_line(page_img, page_object, objects):
    """Paint over one annotated ruling line in *page_img*, in place.

    Args:
        page_img: image array of shape (H, W) or (H, W, C); modified in place.
        page_object: the annotation to erase.
        objects: all annotations of the page, used to find crossing lines.
            Coordinates are used as slice bounds, so they must be integers.

    Returns:
        *page_img*, for convenience.

    Bounds come from ``page_img.shape`` (the original hard-coded 1654x2339),
    and every sampled or painted range is clipped to the image so nothing
    wraps around to the opposite edge or raises ``IndexError``.
    """
    height, width = page_img.shape[:2]
    x_min, x_max, y_min, y_max = _bounding_box(page_object)
    boxes = [_bounding_box(other) for other in objects]

    if abs(x_max - x_min) < _MAX_THICKNESS:  # vertical line
        xx = int((x_min + x_max) / 2)
        # Tops of the horizontal lines whose x extent covers this line.
        crossings = [
            by_min
            for bx_min, bx_max, by_min, by_max in boxes
            if by_max - by_min < _MAX_THICKNESS
            and bx_max + _CROSS_TOLERANCE >= xx
            and bx_min - _CROSS_TOLERANCE <= xx
        ]
        bounds = _segment_bounds(y_min, y_max, crossings)
        bounds[-1] = min(bounds[-1] + _VERTICAL_TAIL, height)

        # Sample to the right of the line, or to the left at the page edge.
        sample_col = xx + _SAMPLE_OFFSET
        if sample_col >= width:
            sample_col = xx - _SAMPLE_OFFSET
        col_lo = max(x_min - _FILL_MARGIN, 0)
        col_hi = min(x_max + _FILL_MARGIN + 1, width)

        for seg_start, seg_end in zip(bounds, bounds[1:]):
            # Only segments that start within the line's own extent are painted.
            if not y_min - _END_MARGIN < seg_start < y_max + _END_MARGIN:
                continue
            samples = page_img[max(seg_start, 0):min(seg_end, height), sample_col]
            row_lo = max(seg_start - _FILL_MARGIN, 0)
            row_hi = min(seg_end - _FILL_MARGIN, height)
            page_img[row_lo:row_hi, col_lo:col_hi] = _dominant_pixel(samples)

    elif abs(y_max - y_min) < _MAX_THICKNESS:  # horizontal line
        yy = int((y_min + y_max) / 2)
        # Left edges of the vertical lines whose y extent overlaps this line.
        crossings = [
            bx_min
            for bx_min, bx_max, by_min, by_max in boxes
            if abs(bx_max - bx_min) < _MAX_THICKNESS
            and by_max + _CROSS_TOLERANCE >= y_min
            and by_min - _CROSS_TOLERANCE <= y_max
        ]
        bounds = _segment_bounds(x_min, x_max, crossings)

        # Sample below the line, or above it at the bottom of the page.
        sample_row = yy + _SAMPLE_OFFSET
        if sample_row >= height:
            sample_row = yy - _SAMPLE_OFFSET
        row_lo = max(y_min - _FILL_MARGIN, 0)
        row_hi = min(y_max + _FILL_MARGIN + 1, height)

        for seg_start, seg_end in zip(bounds, bounds[1:]):
            if not x_min - _END_MARGIN < seg_start < x_max + _END_MARGIN:
                continue
            samples = page_img[sample_row, max(seg_start, 0):min(seg_end, width)]
            col_lo = max(seg_start - _FILL_MARGIN, 0)
            col_hi = min(seg_end - _FILL_MARGIN, width)
            page_img[row_lo:row_hi, col_lo:col_hi] = _dominant_pixel(samples)

    else:
        print("no such line", 'x_min:', x_min, 'x_max:', x_max, 'y_max:', y_max, 'y_min:', y_min)

    return page_img


def main():
    """Process every page listed in the annotation CSV."""
    # Imported here so the helpers above stay importable without Pillow.
    from PIL import Image

    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): the file name says UTF-8 but the original decodes it as
    # GBK; the codec is kept unchanged -- confirm against the real file.
    with open(annotation_csv, 'r', encoding="gbk", newline='') as handle:
        data_list = list(csv.reader(handle))

    print("page number: ", len(data_list) - 1)

    for data in data_list[1:]:  # skip the header row
        img_name = data[0].split('/')[-1]  # last path component
        local_img_path = os.path.join(page_img_dir, img_name)
        print(local_img_path)
        objects = json.loads(data[2])['objects']

        with Image.open(local_img_path) as tu:
            page_img = np.array(tu)

        # NOTE(review): the source formatting was lost; saving after each
        # annotation is the reading under which `cnt`/`cur` are meaningful.
        # The last file written for a page holds the fully cleaned image.
        for cnt, page_object in enumerate(objects):
            cur = page_object.get('cur', cnt)
            erase_line(page_img, page_object, objects)
            output_path = os.path.join(
                output_dir, img_name.split('.')[0] + '_' + str(cur) + ".jpg"
            )
            # Saved with Pillow only: the array is RGB, and the cv2.imwrite
            # call that used to follow re-encoded it as BGR over this file.
            Image.fromarray(page_img.astype('uint8')).save(output_path)


if __name__ == "__main__":
    main()
标签:encoding app 处理 dir os.path ima objects ict span
原文地址:https://www.cnblogs.com/wind-chaser/p/10868935.html