Alan Liu commited on
Commit
989cd20
·
1 Parent(s): 3698d0a
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -132,7 +132,7 @@ with col2:
132
  with col3: # Prefilling
133
  prefilling_operation_count = prefilling_operation(model_config, inference_config)
134
  inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP']*10**12)
135
- inference_info['inference_prefilling_throughput'] = inference_config['input_seq_length']/inference_info['inference_prefilling_time']
136
  cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * inference_config['input_seq_length']))
137
 
138
  operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
@@ -157,8 +157,8 @@ with col3: # Prefilling
157
  with col4: # Prefilling
158
  generation_operation_count = generation_operation(model_config, inference_config)
159
  inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP']*10**12)
160
- inference_info['inference_generation_throughput'] = inference_config['output_seq_length']/inference_info['inference_generation_time']
161
- inference_info['inference_client_generation_throughput'] = inference_config['output_seq_length'] / (inference_info['inference_prefilling_time'] + inference_info['inference_generation_time'])
162
  cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * (inference_config['input_seq_length']+inference_config['output_seq_length'])))
163
 
164
  operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
 
132
  with col3: # Prefilling
133
  prefilling_operation_count = prefilling_operation(model_config, inference_config)
134
  inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP']*10**12)
135
+ inference_info['inference_prefilling_throughput'] = inference_config['input_seq_length']*inference_config['batchsize']/inference_info['inference_prefilling_time']
136
  cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * inference_config['input_seq_length']))
137
 
138
  operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
 
157
  with col4: # Prefilling
158
  generation_operation_count = generation_operation(model_config, inference_config)
159
  inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP']*10**12)
160
+ inference_info['inference_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize']/inference_info['inference_generation_time']
161
+ inference_info['inference_client_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize'] / (inference_info['inference_prefilling_time'] + inference_info['inference_generation_time'])
162
  cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * (inference_config['input_seq_length']+inference_config['output_seq_length'])))
163
 
164
  operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}