coral / segment_and_test.py

Upload 12 files

c7ccc16 verified 3 months ago

19.8 kB

	import os
	import subprocess
	import time
	import shutil
	import re
	import hashlib

	#'''
	# Active configuration: the models to benchmark (fn_list) and the extra
	# partitioner flags needed for particular model / segment-count pairs
	# (custom_args).


	def _size_variants(family, size, resolutions):
		# One model at several input resolutions, e.g. 'yolov8n_416_640px'.
		return [f'{family}{size}_{res}px' for res in resolutions]


	def _diff_threshold(ns):
		return ["--diff_threshold_ns", str(ns)]


	def _partition_step(step):
		return ["--partition_search_step", str(step)]


	def _delegate_step(step):
		return ["--delegate_search_step", str(step)]


	def _multi_subgraph(*extra):
		# --undefok lets binaries that do not know the flag ignore it.
		return ["--undefok=enable_multiple_subgraphs", "--enable_multiple_subgraphs", *extra]


	_YOLOV8_RESOLUTIONS = ('416_640', '384_640', '384_608', '352_608')
	_YOLOV9_RESOLUTIONS = _YOLOV8_RESOLUTIONS + ('352_576',)

	# A plain string is a single model; a nested list groups the resolution
	# variants of one model.
	fn_list = [
		'tf2_ssd_mobilenet_v2_coco17_ptq',
		'ssd_mobilenet_v2_coco_quant_postprocess',
		'ssdlite_mobiledet_coco_qat_postprocess',
		'ssd_mobilenet_v1_coco_quant_postprocess',
		'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq',
		'efficientdet_lite0_320_ptq',
		'efficientdet_lite1_384_ptq',
		'efficientdet_lite2_448_ptq',
		'efficientdet_lite3_512_ptq',
		'efficientdet_lite3x_640_ptq',
		'yolov5n-int8',
		'yolov5s-int8',
		'yolov5m-int8',
		'yolov5l-int8',
	]
	fn_list += [_size_variants('yolov8', size, _YOLOV8_RESOLUTIONS) for size in 'nsml']
	fn_list += [_size_variants('yolov9', size, _YOLOV9_RESOLUTIONS) for size in 'tsmc']
	fn_list.append('ipcam-general-v8')

	# model name -> {segment count: extra command-line flags}
	custom_args = {
		'tf2_ssd_mobilenet_v2_coco17_ptq': {2: _diff_threshold(100000)},
		'ssd_mobilenet_v2_coco_quant_postprocess': {5: _multi_subgraph(*_partition_step(3))},
		'ssdlite_mobiledet_coco_qat_postprocess': {2: _diff_threshold(100000)},
		'efficientdet_lite3_512_ptq': {count: _multi_subgraph() for count in range(2, 8)},
		'efficientdet_lite3x_640_ptq': {
			5: _multi_subgraph(*_partition_step(2)),
			6: _multi_subgraph(*_partition_step(3)),
		},
		'yolov5n-int8': {count: _partition_step(2) for count in range(5, 9)},
		'yolov5s-int8': {count: _partition_step(2) for count in range(5, 9)},
		'yolov5m-int8': {count: _partition_step(2) for count in range(5, 9)},
		'yolov5l-int8': {
			5: _multi_subgraph(*_partition_step(2)),
			6: _partition_step(2),
			7: _partition_step(2),
			8: _partition_step(2),
		},
		'yolov8m_416_640px': {
			5: _partition_step(2),
			6: _partition_step(3),
			7: _partition_step(4),
			8: _partition_step(5),
		},
		'yolov8l_416_640px': {
			4: _partition_step(2),
			5: _partition_step(2),
			6: _partition_step(3),
			7: _partition_step(4),
			8: _partition_step(5),
		},
		'yolov9c_416_640px': {2: _delegate_step(10)},
		'yolov9c_384_640px': {count: _delegate_step(10) for count in (1, 2)},
		'yolov9c_384_608px': {count: _delegate_step(10) for count in (1, 2)},
		'yolov9c_352_608px': {count: _delegate_step(10) for count in (1, 2)},
		'yolov9c_352_576px': {count: _delegate_step(10) for count in (1, 2)},
	}
	#'''

	# Disabled alternative fn_list: the whole assignment sits inside a string
	# literal, so it never runs. Its only live entries are the yolov8 s/m/l
	# *_416_640px models; every other model name in it is commented out.
	'''
	fn_list = [
	# 'yolov5n-int8',
	# 'yolov5s-int8',
	# 'yolov5m-int8',
	# 'yolov5l-int8',
	# 'yolov8n_full_integer_quant',
	# 'yolov8s_full_integer_quant',
	# 'yolov8m_full_integer_quant',
	# 'yolov8l_full_integer_quant',
	# 'yolov8n_480px',
	# 'yolov8s_480px',
	# 'yolov8m_480px',
	# 'yolov8l_480px',
	# 'yolov8n_512px',
	# 'yolov8s_512px',
	# 'yolov8m_512px',
	# 'yolov8l_512px',
	# 'yolov8s_544px',
	# 'yolov8m_544px', # lg 1st seg
	# 'yolov8l_544px', # lg 1st seg
	# 'yolov8s_576px',
	# 'yolov8m_576px', # lg 1st seg
	# 'yolov8l_576px', # lg 1st seg
	# 'yolov8s_608px',
	# 'yolov8m_608px', # lg 1st seg
	# 'yolov8l_608px',
	# 'yolov8n_640px',
	# 'yolov8s_640px',
	# 'yolov8m_640px', # lg 1st seg
	# 'yolov8l_640px', # lg 1st seg
	# 'yolov8n_416_640px', # lg 1st seg
	'yolov8s_416_640px', # lg 1st seg
	'yolov8m_416_640px', # lg 1st seg
	'yolov8l_416_640px'] # lg 1st seg
	# 'ipcam-general-v8'] #'''

	# Disabled alternative custom_args (string literal, never executed):
	# model name -> {segment count: extra flags}, here using diff thresholds,
	# timeouts and explicit initial lower/upper latency bounds.
	'''
	custom_args = {
	'yolov8n_full_integer_quant': {
	2: ["--diff_threshold_ns","100000"],
	3: ["--diff_threshold_ns","200000"]},
	'yolov8s_full_integer_quant': {
	2: ["--diff_threshold_ns","200000"]},
	'yolov8l_full_integer_quant': {
	5: ["--partition_search_step","2"]},
	'yolov8n_480px': {
	2: ["--diff_threshold_ns","100000"],
	3: ["--diff_threshold_ns","200000"]},
	'yolov8s_480px': {
	2: ["--diff_threshold_ns","200000"]},
	'yolov8m_480px': {
	5: ["--partition_search_step","2"]},
	'yolov8n_512px': {
	2: ["--diff_threshold_ns","1200000"],
	3: ["--diff_threshold_ns","600000"]},
	'yolov8s_512px': {
	2: ["--diff_threshold_ns","200000"]},
	'yolov8m_640px': {
	2: ["--diff_threshold_ns","200000", "--undefok=timeout_sec","--timeout_sec=360"]},
	'yolov8l_640px': {
	2: ["--undefok=timeout_sec","--timeout_sec=360"]},
	'yolov8n_416_640px': {
	5: ["--partition_search_step","2"]},
	'yolov8s_416_640px': {
	5: ["--partition_search_step","2"]},
	'yolov8m_416_640px': {
	5: ["--initial_lower_bound_ns","44658311","--initial_upper_bound_ns","45466138","--partition_search_step","2"],
	6: ["--initial_lower_bound_ns","39444004","--initial_upper_bound_ns","40071927","--partition_search_step","3"],
	7: ["--initial_lower_bound_ns","36028652","--initial_upper_bound_ns","37012866","--partition_search_step","4"],
	8: ["--initial_lower_bound_ns","33892323","--initial_upper_bound_ns","34856571","--partition_search_step","5"]},
	'yolov8l_416_640px': {
	5: ["--initial_lower_bound_ns","82297482","--initial_upper_bound_ns","82892528","--partition_search_step","2"],
	6: ["--initial_lower_bound_ns","69966647","--initial_upper_bound_ns","70757195","--partition_search_step","3"],
	7: ["--initial_lower_bound_ns","69067450","--initial_upper_bound_ns","69599451","--partition_search_step","4"],
	8: ["--initial_lower_bound_ns","55889854","--initial_upper_bound_ns","56444625","--partition_search_step","5"]}}#'''

	# Disabled diff_threshold_ns table (string literal, never executed):
	# model name -> {segment count: value passed as --diff_threshold_ns}.
	# The only reader in this file is the 'saturated' branch of the disabled
	# segment-generation code.
	'''
	diff_threshold_ns = {
	'yolov8s_416_640px': {
	2: 4000000},
	'yolov8m_416_640px': {
	4: 40000000,
	5: 30000000},
	'yolov8l_416_640px': {
	7: 90000000,
	8: 70000000}}#'''

	# Disabled reduced custom_args (string literal, never executed) covering
	# only the yolov8m / yolov8l 416_640px models; the active custom_args
	# carries the same values for these two models.
	'''
	custom_args = {
	'yolov8m_416_640px': {
	5: ["--partition_search_step","2"],
	6: ["--partition_search_step","3"],
	7: ["--partition_search_step","4"],
	8: ["--partition_search_step","5"]},
	'yolov8l_416_640px': {
	4: ["--partition_search_step","2"],
	5: ["--partition_search_step","2"],
	6: ["--partition_search_step","3"],
	7: ["--partition_search_step","4"],
	8: ["--partition_search_step","5"]}}#'''

	# Directory holding the source .tflite models and, in per-strategy
	# sub-directories, the compiled Edge TPU segment files.
	seg_dir = "/home/seth/Documents/all_segments/"
	# Sub-directory (with trailing slash) for each segmentation strategy;
	# '' means the files sit directly in seg_dir.
	seg_types = ['', '2x_first_seg/', '15x_first_seg/', '3x_first_seg/', '4x_first_seg/', '15x_last_seg/', '2x_last_seg/', 'dumb/']


	def seg_exists(filename, segment_type, segment_count):
		"""Build the expected segment paths for a model and check they exist.

		Args:
			filename: Model base name, without directory or extension.
			segment_type: Strategy sub-directory including its trailing slash
				('' for seg_dir itself). 'orig_code' is accepted as an alias
				for ''.
			segment_count: Number of segments the model is split into.

		Returns:
			A tuple (seg_list, file_missing). seg_list holds the expected
			*_edgetpu.tflite paths in segment order. file_missing is True
			when at least one of those files is absent -- note the flag
			reports absence, despite the function's name. A segment_count
			of 0 yields an empty list and is reported as missing, so callers
			never treat "no files at all" as a usable model.
		"""
		if segment_type == 'orig_code':
			segment_type = ''

		prefix = seg_dir + segment_type + filename
		if segment_count == 1:
			# An unsegmented model has no "_segment_i_of_n" infix.
			seg_list = [prefix + '_edgetpu.tflite']
		else:
			seg_list = [
				f'{prefix}_segment_{i}_of_{segment_count}_edgetpu.tflite'
				for i in range(segment_count)
			]

		file_missing = not seg_list or any(not os.path.exists(path) for path in seg_list)
		return (seg_list, file_missing)

	# Largest number of TPUs (and therefore segments) to generate and test.
	MAX_TPU_COUNT = 5

	# Disabled segment-generation pass: everything down to the closing quotes
	# is a string literal and never executes. For each segment count in
	# 1..MAX_TPU_COUNT, each model and each strategy directory in seg_types it
	# would build the missing *_edgetpu.tflite files -- edgetpu_compiler
	# directly for a single segment or the 'dumb/' strategy, otherwise a
	# partition_with_profiling binary chosen by strategy directory -- with any
	# per-model flags from custom_args appended.
	# NOTE(review): the 'saturated' branch reads diff_threshold_ns, which in
	# this file only exists inside another disabled string block, and no entry
	# of seg_types contains 'saturated'.
	'''
	# Generate segment files
	for sn in range(1,MAX_TPU_COUNT+1):
	flat_fn_list = []
	for fn in fn_list:
	if isinstance(fn, list):
	flat_fn_list += fn
	else:
	flat_fn_list.append(fn)


	for fn in flat_fn_list:
	for seg_type in seg_types:
	seg_list, file_missing = seg_exists(fn, seg_type, sn)

	if not file_missing:
	continue

	if sn == 1:
	cmd = ["/usr/bin/edgetpu_compiler","-s","-d","--out_dir",seg_dir+seg_type,seg_dir+fn+".tflite"]
	elif 'dumb' in seg_type:
	cmd = ["/usr/bin/edgetpu_compiler","-s","-d","-n",str(sn),"--out_dir",seg_dir+seg_type,seg_dir+fn+".tflite"]
	elif 'saturated' in seg_type:
	try:
	cmd = ["libcoral/out/k8/tools/partitioner/partition_with_profiling","--output_dir",seg_dir+seg_type,"--edgetpu_compiler_binary",
	"/usr/bin/edgetpu_compiler","--model_path",seg_dir+fn+".tflite","--num_segments",str(sn),
	"--diff_threshold_ns", str(diff_threshold_ns[fn][sn])]
	except:
	# Note: "Saturated segments" is an attempt to load as much of the model as possible onto segments
	# while ignoring the latency incurred by slower segments. We assume we'll be able to "speed up"
	# these slower segments simply by running more copies of them. The faster segments ideally will
	# be optimized to all run at roughly the same speed. Thus the overall inference throughput will
	# be limited by how many multiples of the slowest segment we can run.
	#
	# diff_threshold_ns key entries only exist where we want to create "saturated segments". More would
	# mean the model is too sparse across segments. We create saturated segments by adjusting the
	# diff_threshold_ns until the compiler just starts pushing parameters off of the TPUs. Ideally
	# this will result in one or two slow segments and the rest of the segments are roughly equally
	# fast.
	continue

	else:
	if '2x_first_seg' in seg_type:
	#+++ b/coral/tools/partitioner/profiling_based_partitioner.cc
	#@@ -190,6 +190,8 @@ int64_t ProfilingBasedPartitioner::PartitionCompileAndAnalyze(
	# latencies = std::get<2>(coral::BenchmarkPartitionedModel(
	# tmp_edgetpu_segment_paths, &edgetpu_contexts(), kNumInferences));
	#+ latencies[0] /= 2;
	# if (kUseCache) {
	# for (int i = 0; i < num_segments_; ++i) {
	# segment_latency_cache_[{segment_starts[i], num_ops[i]}] = latencies[i];
	#@@ -211,10 +213,11 @@ std::pair<int64_t, int64_t> ProfilingBasedPartitioner::GetBounds(
	# num_segments_, /search_delegate=/true,
	# delegate_search_step))
	# << "Can not compile initial partition.";
	#- const auto latencies = std::get<2>(coral::BenchmarkPartitionedModel(
	#+ auto latencies = std::get<2>(coral::BenchmarkPartitionedModel(
	# tmp_edgetpu_segment_paths, &edgetpu_contexts(), kNumInferences));
	#
	# DeleteFolder(tmp_dir);
	#+ latencies[0] /= 4;
	#
	# int64_t lower_bound = std::numeric_limits<int64_t>::max(), upper_bound = 0;
	# for (auto latency : latencies) {
	#
	# sudo make DOCKER_IMAGE="ubuntu:20.04" DOCKER_CPUS="k8" DOCKER_TARGETS="tools" docker-build

	#// Encourage each segment slower than the previous to spread out the bottlenecks
	#double latency_adjust = 1.0;
	#for (int i = 1; i < num_segments_; ++i)
	#{
	# if (latencies[i-1] < latencies[i])
	# latency_adjust *= 0.97;
	# latencies[i-1] *= latency_adjust;
	#}
	#latencies[num_segments_-1] *= latency_adjust;

	partition_with_profiling_dir = "libcoral/tools.2"
	elif '15x_first_seg' in seg_type:
	partition_with_profiling_dir = "libcoral/tools.15"
	elif '133x_first_seg' in seg_type:
	partition_with_profiling_dir = "libcoral/tools.133"
	elif '166x_first_seg' in seg_type:
	partition_with_profiling_dir = "libcoral/tools.166"
	elif '3x_first_seg' in seg_type:
	partition_with_profiling_dir = "libcoral/tools.3"
	elif '4x_first_seg' in seg_type:
	partition_with_profiling_dir = "libcoral/tools.4"
	elif '15x_last_seg' in seg_type:
	partition_with_profiling_dir = "libcoral/tools.last15"
	elif '2x_last_seg' in seg_type:
	partition_with_profiling_dir = "libcoral/tools.last2"
	elif '125x_last_inc_seg/' == seg_type:
	partition_with_profiling_dir = "libcoral/tools.last125_inc_seg"
	elif '2x_first_125x_last_inc_seg/' == seg_type:
	partition_with_profiling_dir = "libcoral/tools.2last125_inc_seg"
	elif 'inc_seg/' == seg_type:
	partition_with_profiling_dir = "libcoral/tools.inc_seg"
	else:
	partition_with_profiling_dir = "libcoral/tools.orig"

	cmd = [partition_with_profiling_dir+"/partitioner/partition_with_profiling","--output_dir",seg_dir+seg_type,"--edgetpu_compiler_binary",
	"/usr/bin/edgetpu_compiler","--model_path",seg_dir+fn+".tflite","--num_segments",str(sn)]

	try:
	cmd += custom_args[fn][sn]
	except:
	pass

	print(cmd)
	subprocess.run(cmd)#'''


	# Further strategy directories to benchmark, on top of the ones listed
	# where seg_types is first defined.
	seg_types += ['133x_first_seg/', '166x_first_seg/', 'inc_seg/', '125x_last_inc_seg/', '2x_first_125x_last_inc_seg/']

	# Test timings
	#
	# For every model, TPU count, input-size variant, strategy directory and
	# segment count whose segment files all exist, run the multi-TPU detection
	# script and record the latency / throughput figures it prints on stderr.
	# After each TPU count the results are ranked by tensor throughput and the
	# winning segment files are copied into the current directory.

	# Script being benchmarked; the same for every run.
	exe_file = "/home/seth/CodeProject.AI-ObjectDetectionCoral/objectdetection_coral_multitpu.py"

	# Compiled once: these extract the per-inference milliseconds and the
	# tensor MPx/sec figure from the script's stderr.
	ms_per_inference_re = re.compile(r'threads; ([\d\.]+)ms ea')
	tensor_mpx_re = re.compile(r'; ([\d\.]+) tensor MPx')

	fin_timings = {}
	fin_fnames = {}
	for fn in fn_list:
		# A list entry groups the input-size variants of one model; results are
		# filed under the name of the first variant.
		if isinstance(fn, list):
			fn_size_list = fn
			fn = fn[0]
		else:
			fn_size_list = [fn]

		# NOTE(review): timings is created once per model, not once per TPU
		# count, so results measured with fewer TPUs stay in the ranking for
		# the larger counts -- confirm this carry-over is intended.
		timings = []
		fin_timings[fn] = {}
		fin_fnames[fn] = {}

		for num_tpus in range(1, MAX_TPU_COUNT + 1):

			for this_fn in fn_size_list:
				for seg_type in seg_types:
					for sn in range(1, num_tpus + 1):
						# No need to run many slow single TPU tests, just one
						if sn == 1 and seg_type != '':
							continue

						# Skip combinations whose segment files were never generated.
						seg_list, file_missing = seg_exists(this_fn, seg_type, sn)
						if file_missing:
							continue

						cmd = ["python3.9", exe_file, "--model"] + seg_list + [
							"--labels", "coral/pycoral/test_data/coco_labels.txt",
							"--input", "/home/seth/coral/pycoral/test_data/grace_hopper.bmp",
							"--count", "4000", "--num-tpus", str(num_tpus)]
						print(cmd)

						# Clock runtime
						#start_time = time.perf_counter()
						#subprocess.run(cmd)
						#ms_time = 1000 * (time.perf_counter() - start_time) / 4000 # ms * total time / iterations

						# Last quarter runtime
						# A timeout skips this combination. check=True means a
						# non-zero exit raises CalledProcessError and aborts the
						# whole run.
						try:
							c = subprocess.run(cmd, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=3600*2)
						except subprocess.TimeoutExpired:
							print("Timed out!")
							continue
						print(c.stdout)
						print(c.stderr)
						# [0] raises IndexError if stderr lacks the expected summary line.
						ms_time = float(ms_per_inference_re.findall(c.stderr)[0])
						mpps_time = float(tensor_mpx_re.findall(c.stderr)[0])

						# Tuple layout: (ms per inference, TPU count, model file,
						# strategy dir, segment count, tensor MPx/sec).
						timings.append((ms_time, num_tpus, this_fn, seg_type, sn, mpps_time))
						subprocess.run(['uptime'])

			# Rank by tensor throughput (index 5), best first.
			timings = sorted(timings, key=lambda t: t[5], reverse=True)
			if not timings:
				continue

			# Print the top ten
			print(f"TIMINGS FOR {num_tpus} TPUs AND {fn} MODEL:")
			for entry in timings[:10]:
				print(entry)

			# Take the best entry whose strategy is not 'orig_code'. Nothing in
			# seg_types uses that name in this file, so this is normally the
			# same entry as timings[0].
			best = [entry for entry in timings if entry[3] != 'orig_code'][0]
			fin_timings[fn][num_tpus] = timings[0]

			# Add segment to the final list
			# Copy best to local dir
			seg_list, _ = seg_exists(best[2], best[3], best[4])
			fin_fnames[fn][num_tpus] = []
			for s in seg_list:
				# "<parent dir>/<file>" becomes "<parent dir>_<file>" in the
				# current working directory.
				src = os.path.normpath(s)
				out_fname = os.path.basename(os.path.dirname(src)) + "_" + os.path.basename(src)
				shutil.copyfile(s, out_fname)
				# Close the handle deterministically instead of leaking it.
				with open(out_fname, 'rb') as copied:
					checksum = hashlib.md5(copied.read()).hexdigest()
				fin_fnames[fn][num_tpus].append((out_fname, checksum))

			# Create archive for this model / TPU count
			#if len(fin_fnames[fn][num_tpus]) > 1 or num_tpus == 1:
			#	zip_name = f'objectdetection-{fn}-{num_tpus}-edgetpu.zip'
			#	cmd = ['zip', '-9', zip_name] + fin_fnames[fn][num_tpus]
			#	print(cmd)
			#	if os.path.exists(zip_name):
			#		os.unlink(zip_name)
			#	subprocess.run(cmd)

	# Raw dump of everything collected, then a formatted per-model summary.
	print(fin_timings)
	print(fin_fnames)

	# Pretty print all of the segments we've timed and selected
	#
	# fin_timings[fn][tpu_count] is the winning timing tuple
	# (ms, TPU count, model file, strategy dir, segment count, MPx/sec);
	# fin_fnames[fn][tpu_count] is the list of (copied file name, md5) pairs.
	for fn, v in fin_fnames.items():
		print(f" '{fn}': {{")
		for tpu_count, timing in fin_timings[fn].items():
			# A single-file result reads "1 segment " (padded so it lines up
			# with "N segments"); previously it was printed as "1 segments".
			segment_count = len(v[tpu_count]) if tpu_count in v else 1
			if segment_count == 1:
				seg_str = "1 segment "
			else:
				seg_str = f"{segment_count} segments"

			fps = 1000.0 / timing[0]

			print(f"#{timing[0]:6.1f} ms/inference ({fps:5.1f} FPS;{timing[5]:5.1f} tensor MPx/sec) for {tpu_count} TPUs using {seg_str}: {timing[2]}")

		# Only multi-file results get their own "<tpu count>: [...]" entry.
		for tpu_count, out_fnames in v.items():
			if len(out_fnames) > 1:
				print(f"{tpu_count}: "+str(out_fnames)+",")
		# The first file recorded for one TPU is listed under '_tflite'.
		if 1 in v:
			print(f" '_tflite': '{v[1][0]}'")
		print(" },")