Maxwell Gumley | d26e629 | 2024-04-24 10:45:07 -0600 | [diff] [blame^] | 1 | # Parse log profiling data and produce a graph showing encode times in time bins, |
| 2 | # with a breakdown per message type. |
| 3 | |
| 4 | import argparse |
| 5 | import csv |
| 6 | import math |
| 7 | import os |
| 8 | import webbrowser |
| 9 | from dataclasses import dataclass, field |
| 10 | from pathlib import Path |
| 11 | from typing import Dict, List, Optional, Tuple |
| 12 | |
| 13 | import numpy as np |
| 14 | from bokeh.io import output_notebook |
| 15 | from bokeh.models import ColumnDataSource, HoverTool, Legend, LegendItem |
| 16 | from bokeh.palettes import Category20, Viridis256 |
| 17 | from bokeh.plotting import figure, show, output_file |
| 18 | from tabulate import tabulate |
| 19 | |
| 20 | |
@dataclass
class SeriesDetail:
    """Per-message-type samples parsed from the profiling CSV.

    The two lists are parallel: index i of each refers to the same message,
    appended in file order by parse_csv_file.
    """
    # Event-loop timestamp (seconds) of each message.
    event_loop_times_s: List[float]
    # Encode duration (milliseconds) of each message; index-aligned with
    # event_loop_times_s.
    encode_times_ms: List[float]
| 25 | |
| 26 | |
def parse_csv_file(filepath: Path, max_lines: Optional[int],
                   start_time: Optional[float],
                   end_time: Optional[float]) -> Dict[str, SeriesDetail]:
    """Parse the profiling CSV into per-message-type time series.

    Args:
        filepath: CSV produced by the log profiler. Expected columns (with a
            header row): channel_name, channel_type, encode_duration_ns,
            encoding_start_time_ns, message_time_s.
        max_lines: If given, stop after reading this many data rows. Rows
            later dropped by the time window still count toward the limit.
        start_time: If given, drop rows with message_time_s < start_time.
        end_time: If given, drop rows with message_time_s > end_time.

    Returns:
        Mapping from channel type to its encode times (ms) and event-loop
        timestamps (s), in file order.

    Raises:
        ValueError: If a non-empty row does not have exactly five columns or
            a numeric field fails to parse.
    """
    data_by_type: Dict[str, SeriesDetail] = {}

    # newline='' is the csv module's documented requirement for file objects;
    # it prevents newline translation from corrupting quoted fields.
    with open(filepath, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header line.

        line_count = 0
        for line in reader:
            if max_lines is not None and line_count >= max_lines:
                break

            line_count += 1

            # csv.reader yields an empty list for a blank line; skip it
            # rather than crashing on the unpack below (the original used a
            # bare assert, which also disappears under python -O).
            if not line:
                continue

            # Note that channel_name and encoding_start_time_ns are not yet used.
            # They may be used here in the future, but for now they are helpful when
            # directly looking at the csv file.
            channel_name, channel_type, encode_duration_ns, encoding_start_time_ns, message_time_s = line

            # Convert nanoseconds to milliseconds.
            encode_duration_ms = float(encode_duration_ns) * 1e-6
            message_time_s = float(message_time_s)

            # Apply the optional time window.
            if start_time is not None and message_time_s < start_time:
                continue
            if end_time is not None and message_time_s > end_time:
                continue

            # Single dict lookup instead of membership test + subscript.
            detail = data_by_type.get(channel_type)
            if detail is None:
                data_by_type[channel_type] = SeriesDetail(
                    encode_times_ms=[encode_duration_ms],
                    event_loop_times_s=[message_time_s])
            else:
                detail.encode_times_ms.append(encode_duration_ms)
                detail.event_loop_times_s.append(message_time_s)

    return data_by_type
| 71 | |
| 72 | |
@dataclass
class DataBin:
    """One time bin of the histogram built by create_binned_data."""
    # Human-readable "start - end" label for this bin (seconds, 2 decimals).
    bin_range: str
    # Summed encode time (ms) per message type; keys are the top-N type
    # names plus 'other'.
    message_encode_times: Dict[str, float]
| 77 | |
| 78 | |
@dataclass
class BinnedData:
    """All time bins plus the ordered list of top message-type names."""
    # One DataBin per time interval, in ascending time order.
    bins: List[DataBin] = field(default_factory=list)
    # Top-N message types by total encode time, largest first.
    top_type_names: List[str] = field(default_factory=list)
| 83 | |
| 84 | |
def create_binned_data(data_by_type: Dict[str, SeriesDetail],
                       num_bins: int = 25,
                       top_n: int = 5) -> BinnedData:
    """Bin encode times over event-loop time, breaking out the top-N types.

    Args:
        data_by_type: Per-type series from parse_csv_file. Must be non-empty
            and every series must contain at least one sample.
        num_bins: Number of equal-width time bins spanning the data.
        top_n: Number of message types (by total encode time) to break out;
            everything else is accumulated under 'other'.

    Returns:
        A BinnedData whose bins each hold the summed encode time (ms) per
        top type plus 'other'.
    """
    # Total encode time per message type across the entire file.
    total_encode_times: Dict[str, float] = {
        message_type: sum(detail.encode_times_ms)
        for message_type, detail in data_by_type.items()
    }

    # Determine the top N message types based on total encoding times.
    top_types: List[Tuple[str, float]] = sorted(total_encode_times.items(),
                                                key=lambda item: item[1],
                                                reverse=True)[:top_n]
    print(f"{top_types=}")
    top_type_names: List[str] = [type_name for type_name, _ in top_types]

    # Global minimum and maximum times to establish bin edges. Scan every
    # sample instead of assuming each series is sorted (the original read
    # only the first/last element, which is wrong on unsorted input and
    # identical on sorted input).
    min_time: float = min(min(detail.event_loop_times_s)
                          for detail in data_by_type.values())
    max_time: float = max(max(detail.event_loop_times_s)
                          for detail in data_by_type.values())

    # num_bins + 1 edges delimit num_bins equal-width bins.
    bins = np.linspace(min_time, max_time, num_bins + 1)

    # Initialize the list of DataBin instances with the correct number of bins,
    # each starting every type (and 'other') at zero.
    binned_data = BinnedData(top_type_names=top_type_names)
    for i in range(num_bins):
        bin_range = f"{bins[i]:.2f} - {bins[i+1]:.2f}"
        binned_data.bins.append(
            DataBin(bin_range=bin_range,
                    message_encode_times={
                        name: 0
                        for name in top_type_names + ['other']
                    }))

    # Populate binned_data with message encode times.
    for message_type, details in data_by_type.items():
        binned_indices = np.digitize(details.event_loop_times_s, bins) - 1
        # np.digitize puts values equal to the last edge into bin num_bins;
        # clamp so the maximum sample lands in the final real bin.
        binned_indices = np.minimum(binned_indices, num_bins - 1)

        for idx, encode_time in enumerate(details.encode_times_ms):
            bin_index = binned_indices[idx]
            current_bin = binned_data.bins[bin_index]
            if message_type in top_type_names:
                current_bin.message_encode_times[message_type] += encode_time
            else:
                current_bin.message_encode_times['other'] += encode_time

    return binned_data
| 136 | |
| 137 | |
def print_binned_data(binned_data: BinnedData) -> None:
    """Print per-bin encode times as a grid table with a trailing Total column.

    Columns are the top type names (dots broken onto new lines to keep the
    header narrow), then 'other', then the per-bin total.
    """
    columns = binned_data.top_type_names + ['other']
    headers = (['Bin Range']
               + [name.replace('.', '\n.') for name in columns]
               + ['Total'])

    # One row per bin: label, each type's summed time, then the row total.
    table_data = []
    for data_bin in binned_data.bins:
        values = [data_bin.message_encode_times[name] for name in columns]
        table_data.append([data_bin.bin_range, *values, sum(values)])

    # 'grid' format gives the table visible cell borders.
    print(
        tabulate(table_data, headers=headers, tablefmt='grid', floatfmt=".2f"))
| 169 | |
| 170 | |
def plot_bar(binned_data: BinnedData) -> None:
    """Render the binned encode times as a stacked bar chart.

    Writes the chart to plot.html in the working directory and opens it in
    the default web browser.

    Args:
        binned_data: Output of create_binned_data.

    Raises:
        ValueError: If there are more than 20 series (Category20's limit).
    """
    filename = "plot.html"
    output_file(filename, title="Message Encode Time Plot Stacked Bar Graph")

    # Adjust width based on bin count for readability.
    plot_width = max(1200, 50 * len(binned_data.bins))

    # Bin labels double as the categorical x-axis. (Renamed from `bin`,
    # which shadowed the builtin.)
    bin_ranges = [data_bin.bin_range for data_bin in binned_data.bins]

    p = figure(x_range=bin_ranges,
               title='Message Encode Time by Type over Event Loop Time',
               x_axis_label='Event Loop Time Bins',
               y_axis_label='Total Message Encode Time (ms)',
               width=plot_width,
               height=600,
               tools="")

    # One column per series plus the categorical x values.
    source_data = {'bin_edges': bin_ranges}
    for message_type in binned_data.top_type_names + ['other']:
        source_data[message_type] = [
            data_bin.message_encode_times.get(message_type, 0)
            for data_bin in binned_data.bins
        ]

    source = ColumnDataSource(data=source_data)

    # Calculate totals and sort in descending order.
    totals = {
        message_type: sum(source_data[message_type])
        for message_type in source_data if message_type != 'bin_edges'
    }
    sorted_message_types = sorted(totals, key=totals.get, reverse=True)

    # Reverse the list to place larger segments at the top of the stack.
    sorted_message_types.reverse()

    num_types = len(sorted_message_types)
    if num_types > 20:
        raise ValueError(
            f"Number of types ({num_types}) exceeds the number of available colors in Category20."
        )
    colors = Category20[20][:num_types]

    # Apply reversed order to stack rendering.
    renderers = p.vbar_stack(sorted_message_types,
                             x='bin_edges',
                             width=0.7,
                             color=colors,
                             source=source)

    # Change the orientation of the x-axis labels to a 45-degree angle.
    p.xaxis.major_label_orientation = math.pi / 4

    # Create a custom legend, maintaining the reversed order for visual consistency.
    legend_items = [
        LegendItem(label=mt.replace('.', ' '), renderers=[renderers[i]])
        for i, mt in enumerate(sorted_message_types)
    ]
    legend = Legend(items=legend_items, location=(0, -30))
    p.add_layout(legend, 'right')

    # Cosmetic tweaks: anchor bars at zero and drop visual clutter.
    p.y_range.start = 0
    p.x_range.range_padding = 0.05
    p.xgrid.grid_line_color = None
    p.axis.minor_tick_line_color = None
    p.outline_line_color = None

    file_path = os.path.realpath(filename)
    print('\n')
    print(f"Plot saved to '{file_path}'")
    webbrowser.open('file://' + file_path)
| 240 | |
| 241 | |
def main():
    """CLI entry point: parse the profiling CSV, bin it, tabulate, and plot."""
    parser = argparse.ArgumentParser(
        description=
        'Process log files to extract and plot message encode times.')
    parser.add_argument('--log_file_path', type=Path, required=True,
                        help='Path to the log file')
    parser.add_argument('--max_lines', type=int, default=None,
                        help='Maximum number of lines to read from the log file')
    parser.add_argument('--num_bins', type=int, default=40,
                        help='Number of bins to use')
    parser.add_argument('--top_n', type=int, default=10,
                        help='Number of top message types to plot')
    parser.add_argument('--start_time', type=float, default=None,
                        help='Start time in seconds')
    parser.add_argument('--end_time', type=float, default=None,
                        help='End time in seconds')
    cli = parser.parse_args()

    # Parse -> bin -> report pipeline.
    series_by_type = parse_csv_file(filepath=cli.log_file_path,
                                    max_lines=cli.max_lines,
                                    start_time=cli.start_time,
                                    end_time=cli.end_time)
    binned = create_binned_data(series_by_type,
                                num_bins=cli.num_bins,
                                top_n=cli.top_n)
    print_binned_data(binned)
    plot_bar(binned)
    print(f"{os.path.basename(__file__)} Finished.")


if __name__ == '__main__':
    main()