[Skiboot] [PATCH v9 13/22] core/pci: Support PCI slot

Wed Dec 2 11:16:56 AEDT 2015

Hi Gavin,

Comment are inline.

> +static void pci_slot_prepare_link_change(struct pci_slot *slot,
> +					 bool is_up)
> +{
> +	struct phb *phb = slot->phb;
> +	struct pci_device *pd = slot->pd;
> +	uint32_t aercap, mask;
> +

I think some overarching comments in this function would help.
I think what you're doing with AER is:

 1) find AER capability
 2) If we're going down, mask the suprise down bit, because we're
    expecting to go down (and it isn't a surprise). If we're coming up,
    clear that bit in the mask.
 3) If we're going down, mask the recvr_err bit (I'm not sure quite what
    this does, but I'm guessing it means reciever error of some sort) -
    because we expect to hit that when we go down. When we come up,
    clear that bit.

As a side question: If I remember correctly, an EEH event in Linux just
prints a set of bits without decoding them. Would it be worth a later
patch set to Linux to print a decoded explanation of the AER bits?
(again, not suggesting that you have to write that now!)

> +	/* Get AER capability position */
> +	if (pci_has_cap(pd, PCIECAP_ID_AER, true))
> +		aercap = pci_cap(pd, PCIECAP_ID_AER, true);
> +	else
> +		goto restore;
> +
> +	/* Link down event */
> +	pci_cfg_read32(phb, pd->bdfn,
> +		       aercap + PCIECAP_AER_UE_MASK, &mask);
> +	if (is_up)
> +		mask &= ~PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
> +	else
> +		mask |= PCIECAP_AER_UE_MASK_SURPRISE_DOWN;
> +	pci_cfg_write32(phb, pd->bdfn,
> +			aercap + PCIECAP_AER_UE_MASK, mask);
> +
> +	/* Receiver error */
> +	pci_cfg_read32(phb, pd->bdfn,
> +		       aercap + PCIECAP_AER_CE_MASK, &mask);
> +	if (is_up)
> +		mask &= ~PCIECAP_AER_CE_RECVR_ERR;
> +	else
> +		mask |= PCIECAP_AER_CE_RECVR_ERR;
> +	pci_cfg_write32(phb, pd->bdfn,
> +			aercap + PCIECAP_AER_CE_MASK, mask);
> +
> +restore:
> +	if (is_up) {
> +		pci_restore_bridge_buses(phb, pd);
> +		if (phb->ops->device_init)
> +		    pci_walk_dev(phb, pd,
> +				 phb->ops->device_init, NULL);
> +	}
> +}
> +

> +struct pci_slot *pci_slot_alloc(struct phb *phb,
> +				struct pci_device *pd)
> +{
> +	struct pci_slot *slot = NULL;
> +
> +	/* PHB should be always valid */
> +	if (!phb)
> +		return NULL;
> +
> +	/*
> +	 * In case we already had one. If we allocate PHB
> +	 * slot, the passed 'pd' should be NULL. Otherwise,
> +	 * both 'phb' and 'pd' are all valid
> +	 */
> +	if (!pd)
> +		slot = phb->slot;
> +	else
> +		slot = pd->slot;
> +	if (slot) {
> +		PCI_SLOT_DBG(slot, "Already existing\n");
> +		return slot;
> +	}
> +
> +	/* Allocate memory chunk */
> +	slot = zalloc(sizeof(struct pci_slot));
> +	if (!slot) {
> +		prerror("Cannot allocate PCI slot\n");
> +		return NULL;
> +	}
> +
> +	/*
> +	 * Build the slot index, which might be combination of
> +	 * PHB index and device's indicator
> +	 */
> +	if (pd) {
> +		slot->id = pd->bdfn;
> +		slot->id = ((0x1ul << 63) | (slot->id << 16));
> +	}
> +	slot->id |= phb->opal_id;
> +
> +	/*
> +	 * Initialize the slot. The poll function is aleays
s/aleays/always/, although you explain it later anyway, so I don't think
you need the sentence here too.
> +	 * unified for all cases.
> +	 */
> +	slot->phb	= phb;
> +	slot->pd	= pd;
> +	pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
> +
> +	/*
> +	 * PCI slot state poller, which isn't expected to
> +	 * be overrided by individual platforms.
> +	 */
> +	slot->ops.poll			= pci_slot_sm_poll;
> +	slot->ops.prepare_link_change	= pci_slot_prepare_link_change;
> +
> +	/* The slot is belonged to PCI device or PHB */
s/is belonged to/belongs to/;
> +	if (pd)
> +		pd->slot = slot;
> +	else
> +		phb->slot = slot;
> +
> +	return slot;
> +}
> +
> +/*
> + * Find PCI slot. The ID might be combination of PHB and slot indexes, or
> + * single PHB index. We're using highest nibble as the indicator for that.
> + */
> +struct pci_slot *pci_slot_find(uint64_t id)
> +{
> +	struct phb *phb;
> +	struct pci_device *pd;
> +	struct pci_slot *slot = NULL;
> +	uint64_t phb_index;
> +	uint16_t bdfn;
> +
> +	if (id & 0xf000000000000000) {
> +		phb_index = (id & 0xfffful);
> +		bdfn = ((id >> 16) & 0xfffful);
> +	} else {
> +		phb_index = id;
> +	}
> +
> +	/* Search for PHB */
> +	phb = pci_get_phb(phb_index);
> +	if (phb)
> +		slot = phb->slot;

Do you need to bail out here if you don't have a PHB?
Otherwise you will pass NULL to pci_find_dev, which if I remember
correctly is not a case pci_find_dev expects...

> +
> +	/* Search for slot in case it's compound case */
> +	if (id & 0xf000000000000000) {
> +		pd = pci_find_dev(phb, bdfn);
> +		if (pd)
> +			slot = pd->slot;
> +	}
> +
> +	return slot;
> +}
> +
> +int pci_slot_hotplug_event(struct pci_slot *slot, bool add,
> +			   void (*consume)(void *data))
I think conventionally it would be s/consume/consumed/?
Apart from that this function looks good.

> +/*
> + * Turn off the power suply to the slot if there're nothing connected
> + * to it for 2 purposes: saving power obviously, and initializing the
> + * slot to initial power-off state for hotplug
> + */
> +static void pci_try_power_off_slot(struct phb *phb, struct pci_device *pd)
> +{
> +	uint32_t flags = (PCI_SLOT_FLAG_BOOTUP |
> +			  PCI_SLOT_FLAG_NO_HOTPLUG_MSG);
> +	int64_t rc;
> +
> +	/* Check if it's pluggable slot */
> +	if (!pd ||
> +	    !pd->slot ||
> +	    !pd->slot->pluggable ||
> +	    !pd->slot->ops.set_power_status ||
> +	    !pd->slot->ops.poll)
Is there any case ops.poll would be NULL? I thought that was set
generically and not expected to be overridden?

At a higher level, should it even be possible to override it?
Can we simplify the code by removing that possibility?
> +
> +	/* Check if there're something connected */
> +	if (!list_empty(&pd->children))
> +		return;
I initially though this should send a notice, but I see you're calling
it in lots of places where you only want it to turn off if there are no
children (e.g. initialising a slot), so this is OK.

> +	/*
> +	 * Power the slot off. We don't need the notification
> +	 * message about the device-tree node changes because
> +	 * we assume there're no devices behind the slot and
> +	 * their device-tree nodes aren't populated yet. So
> +	 * we needn't care the device-tree node changes at all.
> +	 */
If I understand correctly, you also know there are no devices behind the
slot because you've just checked there are no children.

> +	pci_slot_add_flags(pd->slot, flags);
> +	rc = pd->slot->ops.set_power_status(pd->slot, 0);
> +	while (rc > 0) {
> +		time_wait(rc);
> +		rc = pd->slot->ops.poll(pd->slot, NULL);
> +	}
>  
> -/* pci_scan - Perform a recursive scan of the bus at bus_number
> - *            populating the list passed as an argument. This also
> - *            performs the bus numbering, so it returns the largest
> - *            bus number that was assigned.
> +	pci_slot_remove_flags(pd->slot, flags);
> +	if (rc != OPAL_SUCCESS)
> +		PCINOTICE(phb, pd->bdfn, "Error %lld powering off slot\n", rc);
> +	else
> +		PCIDBG(phb, pd->bdfn, "Power off hotpluggable slot\n");
> +}
> +
> +/* Remove all subordinate PCI devices leading from the indicated
> + * PCI bus. It's used to remove all PCI devices behind one PCI
> + * slot at unplugging time
> + */
> +void pci_bus_remove(struct phb *phb, struct list_head *list)
> +{
> +	struct pci_device *pd, *tmp;
> +
> +	if (list_empty(list))
> +		return;
> +
> +	list_for_each_safe(list, pd, tmp, link) {
> +		pci_bus_remove(phb, &pd->children);
> +
> +		/* Release device node and PCI slot */
> +		if (pd->dn)
> +			dt_free(pd->dn);
> +		if (pd->slot)
> +			free(pd->slot);
> +
> +		/* Remove from parent list and release itself */
> +		list_del(&pd->link);
> +		free(pd);
> +	}
> +}
> +
> +/* Perform a recursive scan of the bus at bus_number populating
> + * the list passed as an argument. This also performs the bus
> + * numbering, so it returns the largest bus number that was
> + * assigned.
>   *
>   * Note: Eventually this might want to access some VPD information
>   *       in order to know what slots to scan and what not etc..
> @@ -458,9 +527,9 @@ static void pci_cleanup_bridge(struct phb *phb, struct pci_device *pd)
>   * XXX NOTE: We might also want to setup the PCIe MPS/MRSS properly
>   *           here as Linux may or may not do it
>   */
> -static uint8_t pci_scan(struct phb *phb, uint8_t bus, uint8_t max_bus,
> -			struct list_head *list, struct pci_device *parent,
> -			bool scan_downstream)
> +uint8_t pci_bus_scan(struct phb *phb, uint8_t bus, uint8_t max_bus,
> +		     struct list_head *list, struct pci_device *parent,
> +		     bool scan_downstream)
>  {
>  	struct pci_device *pd = NULL;
>  	uint8_t dev, fn, next_bus, max_sub, save_max;
> @@ -506,10 +575,15 @@ static uint8_t pci_scan(struct phb *phb, uint8_t bus, uint8_t
> max_bus,
>  	 * We only scan downstream if instructed to do so by the
>  	 * caller. Typically we avoid the scan when we know the
>  	 * link is down already, which happens for the top level
> -	 * root complex, and avoids a long secondary timeout
> +	 * root complex, and avoids a long secondary timeout. The
> +	 * power will be turned off if it's a empty hotpluggable
> +	 * slot.
>  	 */
> -	if (!scan_downstream)
> +	if (!scan_downstream) {
> +		list_for_each(list, pd, link)
> +			pci_try_power_off_slot(phb, pd);
>  		return bus;
> +	}
>  
>  	next_bus = bus + 1;
>  	max_sub = bus;
> @@ -577,8 +651,8 @@ static uint8_t pci_scan(struct phb *phb, uint8_t bus, uint8_t max_bus,
>  
>  		/* Perform recursive scan */
>  		if (do_scan) {
> -			max_sub = pci_scan(phb, next_bus, max_bus,
> -					   &pd->children, pd, true);
> +			max_sub = pci_bus_scan(phb, next_bus, max_bus,
> +					       &pd->children, pd, true);
>  		} else if (!use_max) {
>  			/* XXX Empty bridge... we leave room for hotplug
>  			 * slots etc.. but we should be smarter at figuring

The comment that git has cut off here reads:

                        /* XXX Empty bridge... we leave room for hotplug
                         * slots etc.. but we should be smarter at figuring
                         * out if this is actually a hotpluggable one
                         */
                        max_sub = next_bus + 4;
                        if (max_sub > max_bus)
                                max_sub = max_bus;

Given that we're doing hotplug stuff in this patch set, and we have
slot->pluggable, can we get rid of this XXX while we're here?

> +static int64_t pcie_slot_get_power_status(struct pci_slot *slot,
> +					  uint8_t *val)
> +{
> +	struct phb *phb = slot->phb;
> +	struct pci_device *pd = slot->pd;
> +	uint32_t ecap;
> +	uint16_t slot_ctl;
> +
> +	/*
> +	 * If we don't have power control functionality, the
> +	 * power is always on.
> +	 */
> +	ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
> +	if (!(slot->slot_cap & PCICAP_EXP_SLOTCAP_PWCTRL)) {
> +		*val = PCI_SLOT_POWER_ON;
> +		return OPAL_SUCCESS;
> +	}
> +
> +	/* Check power supply bit */
> +	*val = PCI_SLOT_POWER_OFF;
> +	pci_cfg_read16(phb, pd->bdfn,
> +		       ecap + PCICAP_EXP_SLOTCTL, &slot_ctl);
> +	if (!(slot_ctl & PCICAP_EXP_SLOTCTL_PWRCTLR))
Just checking: this is supposed to be negated? The SLOTCTL_PWRCTLR bit
is high if the slot is off? (That would make sense for PCI tending to
float high, but I thought I'd check.)
> +		*val = PCI_SLOT_POWER_ON;
> +
> +	return OPAL_SUCCESS;
> +}
> +

> +
> +static int64_t pcie_slot_set_attention_status(struct pci_slot *slot,
> +					      uint8_t val)
> +{
> +	struct phb *phb = slot->phb;
> +	struct pci_device *pd = slot->pd;
> +	uint32_t ecap;
> +	uint16_t slot_ctl;
> +
> +	/*
> +	 * Drop the request if the slot doesn't support
> +	 * attention capability
> +	 */
> +	ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
> +	if (!(slot->slot_cap & PCICAP_EXP_SLOTCAP_ATTNI))
> +		return OPAL_SUCCESS;
> +
> +	/* Write the attention bits */
> +	pci_cfg_read16(phb, pd->bdfn,
> +		       ecap + PCICAP_EXP_SLOTCTL, &slot_ctl);
> +	switch (val) {
> +	case PCI_SLOT_ATTN_LED_OFF:
> +		slot_ctl = SETFIELD(PCICAP_EXP_SLOTCTL_ATTNI,
> +				    slot_ctl, PCIE_INDIC_OFF);
> +		break;
> +	case PCI_SLOT_ATTN_LED_ON:
> +		slot_ctl = SETFIELD(PCICAP_EXP_SLOTCTL_ATTNI,
> +				    slot_ctl, PCIE_INDIC_ON);
> +		break;
> +	case PCI_SLOT_ATTN_LED_BLINK:
> +		slot_ctl = SETFIELD(PCICAP_EXP_SLOTCTL_ATTNI,
> +				    slot_ctl, PCIE_INDIC_BLINK);
> +		break;
> +	}
> +
> +	pci_cfg_write16(phb, pd->bdfn, ecap + PCICAP_EXP_SLOTCTL,
> +			slot_ctl);
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t pcie_slot_set_power_status(struct pci_slot *slot,
> +					  uint8_t val)
> +{
> +	struct phb *phb = slot->phb;
> +	struct pci_device *pd = slot->pd;
> +	uint32_t ecap;
> +	uint16_t slot_ctl;
> +
> +	/*
> +	 * Drop the request if the slot doesn't support
> +	 * power control functionality.
> +	 */
> +	ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
> +	if (!(slot->slot_cap & PCICAP_EXP_SLOTCAP_PWCTRL))
> +		return OPAL_SUCCESS;

Is there any case we might have an issue if the OS thinks power is off
when it isn't off? I can't think of any compelling examples off the top
of my head - hotplug won't work, but a slot that doesn't support power
control shouldn't support hotplug, so that shouldn't matter...
Anyway, not saying this needs to change, just flagging that I'm not
entirely comfortable with returning OPAL_SUCCESS here and I'd like to be
convinced it's OK.

> +
> +	/* Set power supply bits */
> +	pci_cfg_read16(phb, pd->bdfn,
> +		       ecap + PCICAP_EXP_SLOTCTL, &slot_ctl);
> +	switch (val) {
> +	case PCI_SLOT_POWER_OFF:
> +		slot_ctl |= PCICAP_EXP_SLOTCTL_PWRCTLR;
> +		slot_ctl = SETFIELD(PCICAP_EXP_SLOTCTL_PWRI, slot_ctl,
> +				    PCIE_INDIC_OFF);
> +		break;
> +	case PCI_SLOT_POWER_ON:
> +		slot_ctl &= ~PCICAP_EXP_SLOTCTL_PWRCTLR;
> +		slot_ctl = SETFIELD(PCICAP_EXP_SLOTCTL_PWRI, slot_ctl,
> +				    PCIE_INDIC_ON);
> +		break;
> +	}
> +
> +	pci_cfg_write16(phb, pd->bdfn,
> +			ecap + PCICAP_EXP_SLOTCTL, slot_ctl);
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t pcie_slot_sm_poll_link(struct pci_slot *slot)
> +{
> +	struct phb *phb = slot->phb;
> +	struct pci_device *pd = slot->pd;
> +	uint32_t ecap;
> +	uint16_t val;
> +	uint8_t presence = 0;
> +
> +	switch (slot->state) {
> +	case PCI_SLOT_STATE_LINK_START_POLL:
> +		PCIE_SLOT_DBG(slot, "LINK: Start polling\n");
> +
> +		/*
> +		 * The link won't come up for ever if the slot
I think you mean "The link won't ever come up if the slot doesn't have a
connected adapter"?
> +		 * doesn't have an connected adapter
> +		 */
> +		if (slot->ops.get_presence_status)
> +			slot->ops.get_presence_status(slot, &presence);
> +		if (!presence) {
> +			PCIE_SLOT_DBG(slot, "LINK: No adapter, end polling\n");
> +			if (slot->ops.prepare_link_change)
> +				slot->ops.prepare_link_change(slot, true);
> +			pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
> +			return OPAL_SUCCESS;
> +		}
If this gets called with state = PCI_SLOT_STATE_LINK_START_POLL, and
with slot->ops.get_presence_status == NULL, nothing will change the
presence bit, and the code will assume there is no adapter. However, in
pcie_slot_get_presence_status above, you suggest that there's a case
where the bit should be hardcoded to PRESENT. Does this code need to
take that into consideration?

Alternatively, could you make it illegal to have a pci_slot_ops struct
without a get_presence_status function? Then any PHB that doesn't
implement it in hardware can implement a stub function to take
responsiblity for making this decision correctly.

> +
> +		/* Enable the link without check */
> +		ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
> +		pci_cfg_read16(phb, pd->bdfn,
> +			       ecap + PCICAP_EXP_LCTL, &val);
> +		val &= ~PCICAP_EXP_LCTL_LINK_DIS;
> +		pci_cfg_write16(phb, pd->bdfn,
> +				ecap + PCICAP_EXP_LCTL, val);
> +
> +		/*
> +		 * If the slot doesn't support link change report
> +		 * capability, we assume the link state is finalized
> +		 * after 1 second.
> +		 */
> +		if (!(slot->link_cap & PCICAP_EXP_LCAP_DL_ACT_REP)) {
> +			pci_slot_set_state(slot, PCI_SLOT_STATE_LINK_DELAY_FINALIZED);
> +			return pci_slot_set_sm_timeout(slot, secs_to_tb(1));
> +		}
> +
> +		/*
> +		 * If the slot supports reporting link state change,
> +		 * we need poll it in determined interval and it's
> +		 * timeout would be 5s.
> +		 */
> +		pci_slot_set_state(slot, PCI_SLOT_STATE_LINK_POLLING);
> +		slot->retries = 250;
> +		return pci_slot_set_sm_timeout(slot, msecs_to_tb(20));
> +	case PCI_SLOT_STATE_LINK_DELAY_FINALIZED:
> +		/*
> +		 * We assume the link state is finalized after
> +		 * determined time delay.
> +		 */
> +		PCIE_SLOT_DBG(slot, "LINK: No link report, end polling\n");
> +		if (slot->ops.prepare_link_change)
> +			slot->ops.prepare_link_change(slot, true);
> +		pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
> +		return OPAL_SUCCESS;
> +	case PCI_SLOT_STATE_LINK_POLLING:
> +		ecap = pci_cap(pd, PCI_CFG_CAP_ID_EXP, false);
> +		pci_cfg_read16(phb, pd->bdfn,
> +			       ecap + PCICAP_EXP_LSTAT, &val);
> +		if (val & PCICAP_EXP_LSTAT_DLLL_ACT) {
> +			PCIE_SLOT_DBG(slot, "LINK: Link is up, end polling\n");
> +			if (slot->ops.prepare_link_change)
> +				slot->ops.prepare_link_change(slot, true);
> +			pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
> +			return OPAL_SUCCESS;
> +		}
> +
> +		/* Check link state again until timeout */
> +		if (slot->retries-- == 0) {
> +			PCIE_SLOT_DBG(slot, "LINK: Timeout waiting for up (%04x)\n",
> +				      val);
> +			if (slot->ops.prepare_link_change)
> +				slot->ops.prepare_link_change(slot, true);
> +			pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
I was initially a bit nervous about preparing for a link_change up and
setting the state to NORMAL if we've been polling and haven't got a
positive result.

Then I realised that we don't necessarily have an accurate presence bit,
so we could be trying to bring up an empty slot. In that case, we don't
want to print a noisy error. However, if we do know that there's a
device in the slot, and we support link state, and we haven't come up,
shouldn't we print a NOTICE or ERROR and put the slot in error state?

> +			return OPAL_SUCCESS;
> +		}
> +
> +		return pci_slot_set_sm_timeout(slot, msecs_to_tb(20));
> +	default:
> +		PCIE_SLOT_DBG(slot, "LINK: Unexpected slot state %08x\n",
> +			      slot->state);
Should this print a noisier error message? (i.e. not at PR_DEBUG) We
would only get here in the case of a skiboot bug, so we'd probably want
to know...
> +	}
> +
> +	pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
> +	return OPAL_HARDWARE;
> +}
> +

> +
> +static int64_t pcie_slot_sm_hreset(struct pci_slot *slot)
> +{
> +	/*
> +	 * After power-on in fundamental reset, we will
> +	 * switch to hot reset. The case is coverred by
What is this case covered by? It looks like you didn't finish your
sentence?
> +	 */
> +	switch (slot->state) {
> +	case PCI_SLOT_STATE_NORMAL:
> +		PCIE_SLOT_DBG(slot, "HRESET: Starts\n");
> +		if (slot->ops.prepare_link_change) {
> +			PCIE_SLOT_DBG(slot, "HRESET: Prepare for link down\n");
> +			slot->ops.prepare_link_change(slot, false);
> +		}
> +		/* fall through */
> +	case PCI_SLOT_STATE_HRESET_START:
> +		PCIE_SLOT_DBG(slot, "HRESET: Assert\n");
> +		pcie_slot_reset(slot, true);
> +		pci_slot_set_state(slot, PCI_SLOT_STATE_HRESET_HOLD);
> +		return pci_slot_set_sm_timeout(slot, msecs_to_tb(250));
> +	case PCI_SLOT_STATE_HRESET_HOLD:
> +		PCIE_SLOT_DBG(slot, "HRESET: Deassert\n");
> +		pcie_slot_reset(slot, false);
> +		pci_slot_set_state(slot, PCI_SLOT_STATE_LINK_START_POLL);
> +		return pci_slot_set_sm_timeout(slot, msecs_to_tb(1800));
> +	default:
> +		PCIE_SLOT_DBG(slot, "HRESET: Unexpected slot state %08x\n",
> +			      slot->state);
Likewise, should this be at greater than PR_DEBUG?
> +	}
> +
> +	pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
> +	return OPAL_HARDWARE;
> +}
> +
> +/*
> + * Usually, individual platforms need override the power
> + * management methods for fundamental reset because the
> + * hot reset is commonly shared.
> + */
I'm not sure I understand this comment. Could you clarify it?
> +static int64_t pcie_slot_sm_freset(struct pci_slot *slot)
> +{
> +	uint8_t power_state = PCI_SLOT_POWER_ON;
> +
> +	switch (slot->state) {
> +	case PCI_SLOT_STATE_NORMAL:
I was going to suggest we a fundamental reset in a non-NORMAL state, in
particular in error states, but then I realised that you haven't defined
any PCI_SLOT_STATEs for slots in confused/error/unknown/broken state.

I'm in two minds about whether that's a good idea. On one hand, putting
the slot in a CONFUSED state if we ever get to an unexpected state in
one of our state machines would help us get skiboot more robust. On the
other hand, it might make it somewhat more fragile in real world use, as
it would deactivate a slot that might still be working fine. I'm
especially worried about this given that we don't have a great test
suite for PCI.

Thinking out loud here... (Gavin, I don't expect you to actually do any
of this at this point!) I suppose you could have STATE_CONFUSED that is
defined to something different for debug builds and is defined to the
same as PCI_SLOT_STATE_NORMAL for release builds. However, it's possibly
not an excellent investment of time in hardening skiboot relative to
building a script that does a whole range of weird PCI stuff or even
trying to formally prove the properties of some of our state
machines. If anyone has thoughts I'd be interested.

> +		PCIE_SLOT_DBG(slot, "FRESET: Starts\n");
> +		if (slot->ops.prepare_link_change)
> +			slot->ops.prepare_link_change(slot, false);
> +
> +		/* Retrieve power state */
> +		if (slot->ops.get_power_status) {
> +			PCIE_SLOT_DBG(slot, "FRESET: Retrieve power state\n");
> +			slot->ops.get_power_status(slot, &power_state);
> +		}
> +

I'm a little bit nervous about this entire state machine.

 - It looks to me like it will 'fake' a full FRESET in the case where
   get_power_status and set_power_status are not defined. Maybe you
   catch this in whatever calls this state machine - I haven't checked
   in great detail - but it looks like it's called by opal_pci_reset and
   the only check is that ops.freset is defined. In that case, you are
   relying on the platform to override it by NULLing out the pointer if
   it's not supported. I'm not really comfortable with this: it's not
   really obvious that a platform would need to do this.

   Is there any reason this can't check against slot->power_ctl and
   return OPAL_UNSUPPORTED if it's not 1?

 - It tests for the get_power_status and set_power_status separately. Is
   there any possible scenario where only one would be implemented? Can
   the code assume that if slot->power_ctl is set then the functions are
   implemented? In other words, can we require a platform either:
     * set power_ctl = 1 and implement get_/set_power

     * set power_ctl = 0 and be guaranteed that no-one will try to call
       get_/set_power

> +		/* In power on state, power it off */
> +		if (power_state == PCI_SLOT_POWER_ON &&
> +		    slot->ops.set_power_status) {
> +			PCIE_SLOT_DBG(slot, "FRESET: Power is on, turn off\n");
> +			slot->ops.set_power_status(slot,
> +						   PCI_SLOT_POWER_OFF);
> +			pci_slot_set_state(slot,
> +					   PCI_SLOT_STATE_FRESET_POWER_OFF);
> +			return pci_slot_set_sm_timeout(slot, msecs_to_tb(50));
> +		}
> +		/* No power state change, fall through */
> +	case PCI_SLOT_STATE_FRESET_POWER_OFF:
> +		PCIE_SLOT_DBG(slot, "FRESET: Power is off, turn on\n");
> +		if (slot->ops.set_power_status)
> +			slot->ops.set_power_status(slot,
> +						   PCI_SLOT_POWER_ON);
> +		pci_slot_set_state(slot, PCI_SLOT_STATE_HRESET_START);
> +		return pci_slot_set_sm_timeout(slot, msecs_to_tb(50));
> +	default:
> +		PCIE_SLOT_DBG(slot, "FRESET: Unexpected slot state %08x\n",
> +			      slot->state);
> +	}
> +
> +	pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
> +	return OPAL_HARDWARE;
> +}
> +

> +/*
> + * We have hard and soft reset for slot. Hard reset requires
> + * power-off and then power-on, but soft reset only resets
> + * secondary bus.
> + */
> +struct pci_slot;
> +struct pci_slot_ops {
> +	/* For slot management */
> +	int64_t (*get_presence_status)(struct pci_slot *slot, uint8_t *val);
> +	int64_t (*get_link_status)(struct pci_slot *slot, uint8_t *val);
> +	int64_t (*get_power_status)(struct pci_slot *slot, uint8_t *val);
> +	int64_t (*get_attention_status)(struct pci_slot *slot, uint8_t *val);
> +	int64_t (*get_latch_status)(struct pci_slot *slot, uint8_t *val);
> +	int64_t (*set_power_status)(struct pci_slot *slot, uint8_t val);
> +	int64_t (*set_attention_status)(struct pci_slot *slot, uint8_t val);
> +
> +	/* SM based functions for reset */
> +	void (*prepare_link_change)(struct pci_slot *slot, bool is_up);
> +	int64_t (*poll_link)(struct pci_slot *slot);
> +	int64_t (*creset)(struct pci_slot *slot);
> +	int64_t (*freset)(struct pci_slot *slot);
> +	int64_t (*pfreset)(struct pci_slot *slot);
> +	int64_t (*hreset)(struct pci_slot *slot);
> +	int64_t (*poll)(struct pci_slot *slot, uint8_t *val);
As mentioned earlier, if poll is unified, does it need to be here?

> +
> +	/* Auxillary functions */
> +	void (*add_properties)(struct pci_slot *slot, struct dt_node *np);
> +};
> +

> diff --git a/include/platform.h b/include/platform.h
> index f1bdc30..e77bbe1 100644
> --- a/include/platform.h
> +++ b/include/platform.h
> @@ -20,6 +20,7 @@
>  /* Some fwd declarations for types used further down */
>  struct phb;
>  struct pci_device;
> +struct pci_slot;
>  struct errorlog;
>  
>  enum resource_id {
> @@ -72,7 +73,7 @@ struct platform {
>  
>  	/*
>  	 * This is called once per PHB before probing. It allows the
> -	 * platform to setup some PHB private data that can be used
> +	* platform to setup some PHB private data that can be used
It looks like you accidentally deleted a space here?

>  	 * later on by calls such as pci_get_slot_info() below. The
>  	 * "index" argument is the PHB index within the IO HUB (or
>  	 * P8 chip).
> diff --git a/platforms/ibm-fsp/lxvpd.c b/platforms/ibm-fsp/lxvpd.c
> index 90c9e09..542c49a 100644
> --- a/platforms/ibm-fsp/lxvpd.c
> +++ b/platforms/ibm-fsp/lxvpd.c
> @@ -24,6 +24,7 @@
>  #include <vpd.h>
>  #include <pci.h>
>  #include <pci-cfg.h>
> +#include <pci-slot.h>
>  
>  #include "lxvpd.h"
>  
> -- 
> 2.1.0
>
> _______________________________________________
> Skiboot mailing list
> Skiboot at lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 859 bytes
Desc: not available
URL: <http://lists.ozlabs.org/pipermail/skiboot/attachments/20151202/aa7758d8/attachment.sig>