# Parse log profiling data and produce a graph showing encode times in time bins,
# with a breakdown per message type.
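#
# The CSV is expected to contain a header row followed by the columns:
#   channel_name, channel_type, encode_duration_ns, encoding_start_time_ns,
#   message_time_s
#
# Example invocation (script and CSV file names are illustrative):
#   python encode_times_plot.py --log_file_path /tmp/encode_times.csv \
#       --num_bins 40 --top_n 10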

import argparse
import csv
import math
import os
import webbrowser
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
from bokeh.models import ColumnDataSource, Legend, LegendItem
from bokeh.palettes import Category20
from bokeh.plotting import figure, output_file, save
from tabulate import tabulate


@dataclass
class SeriesDetail:
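    """Per-message-type series of encode costs.

    event_loop_times_s holds message times in seconds; encode_times_ms holds
    the corresponding encode durations in milliseconds.
    """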
    event_loop_times_s: List[float]
    encode_times_ms: List[float]


def parse_csv_file(filepath: Path, max_lines: Optional[int],
                   start_time: Optional[float],
                   end_time: Optional[float]) -> Dict[str, SeriesDetail]:
    """Parses the CSV file into per-message-type series, respecting the
    optional line limit and [start_time, end_time] window."""
    data_by_type: Dict[str, SeriesDetail] = {}

    with open(filepath, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header line.

        line_count = 0
        for line in reader:
            if max_lines is not None and line_count >= max_lines:
                break

            line_count += 1

            assert len(line) == 5, f"Unexpected CSV row: {line}"

            # Note that channel_name and encoding_start_time_ns are not yet
            # used. They may be used here in the future, but for now they are
            # helpful when looking directly at the csv file.
            channel_name, channel_type, encode_duration_ns, encoding_start_time_ns, message_time_s = line

            # Convert nanoseconds to milliseconds.
            encode_duration_ms = float(encode_duration_ns) * 1e-6
            message_time_s = float(message_time_s)

            if start_time is not None and message_time_s < start_time:
                continue
            if end_time is not None and message_time_s > end_time:
                continue

            if channel_type in data_by_type:
                data_by_type[channel_type].encode_times_ms.append(
                    encode_duration_ms)
                data_by_type[channel_type].event_loop_times_s.append(
                    message_time_s)
            else:
                data_by_type[channel_type] = SeriesDetail(
                    encode_times_ms=[encode_duration_ms],
                    event_loop_times_s=[message_time_s])

    return data_by_type


@dataclass
class DataBin:
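    """Encode-time totals (ms) per message type for one time bin."""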
    bin_range: str
    message_encode_times: Dict[str, float]


@dataclass
class BinnedData:
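    """All time bins plus the names of the types that get their own column."""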
    bins: List[DataBin] = field(default_factory=list)
    top_type_names: List[str] = field(default_factory=list)


def create_binned_data(data_by_type: Dict[str, SeriesDetail],
                       num_bins: int = 25,
                       top_n: int = 5) -> BinnedData:
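    """Buckets encode times into num_bins equal-width time bins.

    The top_n message types by total encode time each get their own series;
    all remaining types are folded into an 'other' series.
    """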
    # Calculate total encoding times for each message type across the entire file.
    total_encode_times: Dict[str, float] = {
        message_type: sum(detail.encode_times_ms)
        for message_type, detail in data_by_type.items()
    }

    # Determine the top N message types based on total encoding times.
    top_types: List[Tuple[str, float]] = sorted(total_encode_times.items(),
                                                key=lambda item: item[1],
                                                reverse=True)[:top_n]
    print(f"{top_types=}")
    top_type_names: List[str] = [type_name for type_name, _ in top_types]

    # Find the global minimum and maximum times to establish bin edges. This
    # assumes each per-type series is in chronological order, i.e. the CSV
    # rows are time-ordered.
    min_time: float = min(detail.event_loop_times_s[0]
                          for detail in data_by_type.values())
    max_time: float = max(detail.event_loop_times_s[-1]
                          for detail in data_by_type.values())

    # Create bins.
    bins = np.linspace(min_time, max_time, num_bins + 1)

    # Initialize the list of DataBin instances with the correct number of bins.
    binned_data = BinnedData(top_type_names=top_type_names)
    for i in range(num_bins):
        bin_range = f"{bins[i]:.2f} - {bins[i+1]:.2f}"
        binned_data.bins.append(
            DataBin(bin_range=bin_range,
                    message_encode_times={
                        name: 0.0
                        for name in top_type_names + ['other']
                    }))

    # Populate binned_data with message encode times.
    for message_type, details in data_by_type.items():
        binned_indices = np.digitize(details.event_loop_times_s, bins) - 1
        # Clamp indices that land exactly on the maximum edge into the last bin.
        binned_indices = np.minimum(binned_indices, num_bins - 1)

        for idx, encode_time in enumerate(details.encode_times_ms):
            bin_index = binned_indices[idx]
            current_bin = binned_data.bins[bin_index]
            if message_type in top_type_names:
                current_bin.message_encode_times[message_type] += encode_time
            else:
                current_bin.message_encode_times['other'] += encode_time

    return binned_data


def print_binned_data(binned_data: BinnedData) -> None:
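    """Prints a per-bin table with one column per top message type, plus
    'other' and a row total, all in milliseconds."""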
    # Build the table headers: break each message type name at '.' so long
    # names wrap across lines, and add a 'Total' column.
    headers = ['Bin Range'] + [
        key.replace('.', '\n.')
        for key in binned_data.top_type_names + ['other']
    ] + ['Total']

    # Initialize the table data list.
    table_data = []

    # Populate the table data with the values from each DataBin instance and calculate totals.
    for data_bin in binned_data.bins:
        # Initialize a row with the bin range.
        row = [data_bin.bin_range]
        # Add the total message encode times for each message type.
        encode_times = [
            data_bin.message_encode_times[message_type]
            for message_type in binned_data.top_type_names
        ]
        other_time = data_bin.message_encode_times['other']
        row += encode_times + [other_time]
        # Calculate the total encode time for the row and append it.
        total_encode_time = sum(encode_times) + other_time
        row.append(total_encode_time)
        # Append the row to the table data.
        table_data.append(row)

    # Print the table using tabulate with 'grid' format for better structure.
    print(
        tabulate(table_data, headers=headers, tablefmt='grid', floatfmt=".2f"))


def plot_bar(binned_data: BinnedData) -> None:
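    """Renders the binned encode times as a stacked bar chart, saves it to
    plot.html, and opens it in a browser."""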
    filename = "plot.html"
    output_file(filename, title="Message Encode Time Plot Stacked Bar Graph")

    # Adjust width based on bin count for readability.
    plot_width = max(1200, 50 * len(binned_data.bins))

    p = figure(x_range=[data_bin.bin_range for data_bin in binned_data.bins],
               title='Message Encode Time by Type over Event Loop Time',
               x_axis_label='Event Loop Time Bins',
               y_axis_label='Total Message Encode Time (ms)',
               width=plot_width,
               height=600,
               tools="")

    source_data = {
        'bin_edges': [data_bin.bin_range for data_bin in binned_data.bins]
    }
    for message_type in binned_data.top_type_names + ['other']:
        source_data[message_type] = [
            data_bin.message_encode_times.get(message_type, 0)
            for data_bin in binned_data.bins
        ]

    source = ColumnDataSource(data=source_data)

    # Calculate totals and sort in descending order.
    totals = {
        message_type: sum(source_data[message_type])
        for message_type in source_data if message_type != 'bin_edges'
    }
    sorted_message_types = sorted(totals, key=totals.get, reverse=True)

    # Reverse the list to place larger segments at the top of the stack.
    sorted_message_types.reverse()

    num_types = len(sorted_message_types)
    if num_types > 20:
        raise ValueError(
            f"Number of types ({num_types}) exceeds the number of available colors in Category20."
        )
    colors = Category20[20][:num_types]

    # Apply the reversed order to the stack rendering.
    renderers = p.vbar_stack(sorted_message_types,
                             x='bin_edges',
                             width=0.7,
                             color=colors,
                             source=source)

    # Rotate the x-axis labels to a 45-degree angle so they stay legible.
    p.xaxis.major_label_orientation = math.pi / 4

    # Create a custom legend, maintaining the reversed order for visual consistency.
    legend_items = [
        LegendItem(label=mt.replace('.', ' '), renderers=[renderers[i]])
        for i, mt in enumerate(sorted_message_types)
    ]
    legend = Legend(items=legend_items, location=(0, -30))
    p.add_layout(legend, 'right')

    p.y_range.start = 0
    p.x_range.range_padding = 0.05
    p.xgrid.grid_line_color = None
    p.axis.minor_tick_line_color = None
    p.outline_line_color = None

    # Write the plot to disk; output_file() alone only sets the destination.
    save(p)

    file_path = os.path.realpath(filename)
    print('\n')
    print(f"Plot saved to '{file_path}'")
    webbrowser.open('file://' + file_path)


def main():
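    """Parses command-line arguments, then builds, prints, and plots the
    binned encode-time data."""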
    parser = argparse.ArgumentParser(
        description='Process log files to extract and plot message encode times.')
    parser.add_argument('--log_file_path',
                        type=Path,
                        help='Path to the log file',
                        required=True)
    parser.add_argument(
        '--max_lines',
        type=int,
        default=None,
        help='Maximum number of lines to read from the log file')
    parser.add_argument('--num_bins',
                        type=int,
                        default=40,
                        help='Number of bins to use')
    parser.add_argument('--top_n',
                        type=int,
                        default=10,
                        help='Number of top message types to plot')
    parser.add_argument('--start_time',
                        type=float,
                        default=None,
                        help='Start time in seconds')
    parser.add_argument('--end_time',
                        type=float,
                        default=None,
                        help='End time in seconds')

    args = parser.parse_args()

    data_by_type = parse_csv_file(filepath=args.log_file_path,
                                  max_lines=args.max_lines,
                                  start_time=args.start_time,
                                  end_time=args.end_time)
    binned_data = create_binned_data(data_by_type,
                                     num_bins=args.num_bins,
                                     top_n=args.top_n)
    print_binned_data(binned_data)
    plot_bar(binned_data)
    print(f"{os.path.basename(__file__)} Finished.")


if __name__ == '__main__':
    main()