# Copyright (c) 2016 The University of Manchester
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from spinn_utilities.progress_bar import ProgressBar
from spinn_utilities.log import FormatAdapter
from spinnman.exceptions import SpinnmanException
from spinn_front_end_common.data import FecDataView
from spinn_front_end_common.interface.provenance import ProvenanceWriter
logger = FormatAdapter(logging.getLogger(__name__))
[docs]def router_provenance_gatherer():
gather = _RouterProvenanceGatherer()
# pylint: disable=protected-access
gather._add_router_provenance_data()
class _RouterProvenanceGatherer(object):
"""
Gathers diagnostics from the routers.
"""
__slots__ = []
def _add_router_provenance_data(self):
"""
Writes the provenance data of the router diagnostics
"""
progress = ProgressBar(FecDataView.get_machine().n_chips*2,
"Getting Router Provenance")
seen_chips = set()
# get all extra monitor core data if it exists
reinjection_data = None
if FecDataView.has_monitors():
monitor = FecDataView.get_monitor_by_xy(0, 0)
reinjection_data = monitor.get_reinjection_status_for_vertices()
for router_table in progress.over(
FecDataView.get_uncompressed().routing_tables, False):
seen_chips.add(self._add_router_table_diagnostic(
router_table, reinjection_data))
# Get what info we can for chips where there are problems or no table
for chip in progress.over(sorted(
FecDataView.get_machine().chips, key=lambda c: (c.x, c.y))):
if (chip.x, chip.y) not in seen_chips:
self._add_unseen_router_chip_diagnostic(
chip, reinjection_data)
def _add_router_table_diagnostic(self, table, reinjection_data):
"""
:param ~.MulticastRoutingTable table:
:param dict(tuple(int,int),ReInjectionStatus) reinjection_data:
"""
x = table.x
y = table.y
try:
transceiver = FecDataView.get_transceiver()
diagnostics = transceiver.get_router_diagnostics(x, y)
except SpinnmanException:
logger.warning(
"Could not read routing diagnostics from {}, {}",
x, y, exc_info=True)
return
status = self.__get_status(reinjection_data, x, y)
self.__router_diagnostics(x, y, diagnostics, status, True, table)
return x, y
def _add_unseen_router_chip_diagnostic(self, chip, reinjection_data):
"""
:param ~.Chip chip:
:param dict(tuple(int,int),ReInjectionStatus) reinjection_data:
"""
try:
transceiver = FecDataView.get_transceiver()
diagnostics = transceiver.get_router_diagnostics(chip.x, chip.y)
except SpinnmanException:
# There could be issues with unused chips - don't worry!
return
if (diagnostics.n_dropped_multicast_packets or
diagnostics.n_local_multicast_packets or
diagnostics.n_external_multicast_packets):
status = self.__get_status(reinjection_data, chip.x, chip.y)
self.__router_diagnostics(
chip.x, chip.y, diagnostics, status, False, None)
@staticmethod
def __get_status(reinjection_data, x, y):
"""
:param dict(tuple(int,int),ReInjectionStatus) reinjection_data:
:param int x:
:param int y:
:rtype: ReInjectionStatus or None
"""
return reinjection_data[x, y] if reinjection_data else None
def __router_diagnostics(self, x, y, diagnostics, status, expected, table):
"""
Describes the router diagnostics for one router.
:param int x: x coordinate of the router in question
:param int y: y coordinate of the router in question
:param ~.RouterDiagnostics diagnostics: the router diagnostics object
:param ReInjectionStatus status:
the data gained from the extra monitor re-injection subsystem
:param bool expected:
:param ~.AbstractMulticastRoutingTable table:
the router table generated by the PACMAN tools
"""
# pylint: disable=too-many-arguments
# simplify the if by making components of it outside.
has_dropped = (diagnostics.n_dropped_multicast_packets > 0)
missing_stuff = False
has_reinjection = status is not None
if has_reinjection:
missing_stuff = ((
status.n_dropped_packets + status.n_missed_dropped_packets +
status.n_dropped_packet_overflows +
status.n_reinjected_packets + status.n_processor_dumps +
status.n_link_dumps) < diagnostics.n_dropped_multicast_packets)
with ProvenanceWriter() as db:
db.insert_router(
x, y, "Local_Multicast_Packets",
diagnostics.n_local_multicast_packets, expected)
db.insert_router(
x, y, "External_Multicast_Packets",
diagnostics.n_external_multicast_packets, expected)
db.insert_router(
x, y, "Dropped_Multicast_Packets",
diagnostics.n_dropped_multicast_packets, expected)
if (has_dropped and not has_reinjection) or (
has_dropped and has_reinjection and missing_stuff):
db.insert_report(
f"The router on {x}, {y} has dropped "
f"{diagnostics.n_dropped_multicast_packets} "
f"multicast route packets. "
f"Try increasing the machine_time_step and/or the time "
f"scale factor or reducing the number of atoms per core.")
db.insert_router(
x, y, "Dropped_Multicast_Packets_via_local_transmission",
diagnostics.user_3, expected)
if diagnostics.user_3 > 0:
db.insert_report(
f"The router on {x}, {y} has dropped {diagnostics.user_3} "
"multicast packets that were transmitted by local cores. "
"This occurs where the router has no entry associated "
"with the multicast key. "
"Try investigating the keys allocated to the vertices "
"and the router table entries for this chip.")
db.insert_router(
x, y, "default_routed_external_multicast_packets",
diagnostics.user_2, expected)
if diagnostics.user_2 > 0 and not (
table and table.number_of_defaultable_entries):
db.insert_report(
f"The router on {x}, {y} has default routed "
f"{diagnostics.user_2} multicast packets, but the router "
f"table did not expect any default routed packets. "
f"This occurs where the router has no entry associated "
f"with the multicast key. "
f"Try investigating the keys allocated to the vertices "
f"and the router table entries for this chip.")
if table:
db.insert_router(
x, y, "Entries", table.number_of_entries, expected)
routes = set()
for ent in table.multicast_routing_entries:
routes.add(ent.spinnaker_route)
db.insert_router(x, y, "Unique_Routes", len(routes), expected)
db.insert_router(
x, y, "Local_P2P_Packets",
diagnostics.n_local_peer_to_peer_packets, expected)
db.insert_router(
x, y, "External_P2P_Packets",
diagnostics.n_external_peer_to_peer_packets, expected)
db.insert_router(
x, y, "Dropped_P2P_Packets",
diagnostics.n_dropped_peer_to_peer_packets, expected)
db.insert_router(
x, y, "Local_NN_Packets",
diagnostics.n_local_nearest_neighbour_packets, expected)
db.insert_router(
x, y, "External_NN_Packets",
diagnostics.n_external_nearest_neighbour_packets, expected)
db.insert_router(
x, y, "Dropped_NN_Packets",
diagnostics.n_dropped_nearest_neighbour_packets, expected)
db.insert_router(
x, y, "Local_FR_Packets",
diagnostics.n_local_fixed_route_packets, expected)
db.insert_router(
x, y, "External_FR_Packets",
diagnostics.n_external_fixed_route_packets, expected)
db.insert_router(
x, y, "Dropped_FR_Packets",
diagnostics.n_dropped_fixed_route_packets, expected)
if diagnostics.n_dropped_fixed_route_packets > 0:
db.insert_report(
f"The router on chip {x}:{y} dropped "
f"{diagnostics.n_dropped_fixed_route_packets} fixed "
f"route packets. "
f"This is indicative of an error within the data "
f"extraction process as this is the only expected user of "
"fixed route packets.")
db.insert_router(
x, y, "Error status", diagnostics.error_status, expected)
if diagnostics.error_status > 0:
db.insert_report(
f"The router on {x}, {y} has a non-zero error status. "
f"This could indicate a hardware fault. "
f"The errors set are {diagnostics.errors_set}, and the "
f"error count is {diagnostics.error_count}")
if status is None:
return # rest depends on status
db.insert_router(
x, y, "Received_For_Reinjection",
status.n_dropped_packets, expected)
db.insert_router(
x, y, "Missed_For_Reinjection",
status.n_missed_dropped_packets, expected)
if status.n_missed_dropped_packets > 0:
db.insert_report(
f"The extra monitor on {x}, {y} has missed "
f"{status.n_missed_dropped_packets} packets.")
db.insert_router(
x, y, "Reinjection_Overflows",
status.n_dropped_packet_overflows, expected,)
if status.n_dropped_packet_overflows > 0:
db.insert_report(
f"The extra monitor on {x}, {y} has dropped "
f"{status.n_dropped_packet_overflows} packets.")
db.insert_router(
x, y, "Reinjected", status.n_reinjected_packets, expected)
db.insert_router(
x, y, "Dumped_from_a_Link", status.n_link_dumps, expected)
if status.n_link_dumps > 0:
db.insert_report(
f"The extra monitor on {x}, {y} has detected that "
f"{status.n_link_dumps} packets were dumped from "
f"outgoing links {status.links_dropped_from} of this "
f"chip's router. This often occurs "
f"when external devices are used in the script but not "
f"connected to the communication fabric correctly. "
f"These packets may have been reinjected multiple times "
f"and so this number may be an overestimate.")
db.insert_router(
x, y, "Dumped_from_a_processor", status.n_processor_dumps,
expected)
if status.n_processor_dumps > 0:
db.insert_report(
f"The extra monitor on {x}, {y} has detected that "
f"{status.n_processor_dumps} packets were dumped from "
f"cores {status.processors_dropped_from} failing to take "
"the packet. This often occurs when "
"the executable has crashed or has not been given a "
"multicast packet callback. It can also result from the "
"core taking too long to process each packet. These "
"packets were reinjected and so this number is likely an "
"overestimate.")