Training courses

Kernel and Embedded Linux

Bootlin training courses

Embedded Linux, kernel,
Yocto Project, Buildroot, real-time,
graphics, boot time, debugging...

Bootlin logo

Elixir Cross Referencer

#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only

# Load the shared EEH helper functions (eeh_supported, used right below,
# is presumably defined there).
. ./eeh-functions.sh

# Without EEH there is nothing to test: report it and leave with a
# success status.
eeh_supported || {
	echo "EEH not supported on this system, skipping"
	exit 0
}

# Give up only when neither of the debugfs EEH testing files exists.
if ! { [ -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] || \
       [ -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; } ; then
	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
	exit 1
fi

# Snapshot the PCI device listing so it can be diffed against a fresh
# listing at the end of the script. Bail out if the temporary file cannot
# be created: redirecting to an empty file name would fail and the final
# comparison would have no baseline.
pre_lspci="$(mktemp)" || {
	echo "Failed to create a temporary file for the lspci snapshot" >&2
	exit 1
}
lspci > "$pre_lspci"

# Bump the max freeze count to something absurd so we don't
# trip over it while breaking things.
echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes

# Record the devices that we break in here. Assuming everything
# goes to plan we should get them back once the recovery process
# is finished.
devices=""
dev_count=0

# Build up a list of candidate devices: every entry of
# /sys/bus/pci/devices whose name ends in ".0". The directory is globbed
# directly instead of parsing the output of ls.
for dev_path in /sys/bus/pci/devices/*.0 ; do
	# An unmatched glob is left as the literal pattern; skip it.
	[ -e "$dev_path" ] || continue

	# Strip the directory part to get the bare device name.
	dev="${dev_path##*/}"

	# Skip bridges since we can't recover them (yet...)
	if [ -e "$dev_path/pci_bus" ] ; then
		echo "$dev, Skipped: bridge"
		continue
	fi

	# Skip VFs for now since we don't have a reliable way
	# to break them.
	if [ -e "$dev_path/physfn" ] ; then
		echo "$dev, Skipped: virtfn"
		continue
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		echo "$dev, Skipped: Bad initial PE state"
		continue
	fi

	echo "$dev, Added"

	# Add to the list of devices to check. Counting here avoids
	# spawning echo | wc just to count words afterwards.
	devices="$devices $dev"
	dev_count=$((dev_count + 1))
done

echo "Found ${dev_count} breakable devices..."

# Break every candidate device in turn and count the ones that fail.
# $devices is a space-separated list, so it is deliberately left unquoted
# to let the shell split it into individual device names.
failed=0
for dev in $devices ; do
	echo "Breaking $dev..."

	# Re-check the PE state right before breaking the device; a device
	# whose PE is not ok at this point is counted as a failure.
	if ! pe_ok "$dev" ; then
		echo "Skipping $dev, Initial PE state is not ok"
		failed=$((failed + 1))
		continue
	fi

	if ! eeh_one_dev "$dev" ; then
		failed=$((failed + 1))
	fi
done

echo "$failed devices failed to recover ($dev_count tested)"

# Show any difference between the PCI listing taken before the test and
# the current one, then drop the snapshot.
lspci | diff -u "$pre_lspci" -
rm -f "$pre_lspci"

# Report plain pass/fail. Exit statuses are truncated to 8 bits, so using
# the raw failure count would report success for 256 failures, and small
# counts could be mistaken for special status codes by a test harness
# (NOTE(review): e.g. kselftest treats 4 as "skip" - confirm this script
# runs under it).
if [ "$failed" -ne 0 ] ; then
	exit 1
fi
exit 0