Discussion:
[PATCH V7 01/17] PCI/IOV: Export interface to retrieve VF's BDF
Wei Yang
2014-07-24 06:22:11 UTC
When implementing SR-IOV on the PowerNV platform, some resource reservation is
needed for VFs, which don't exist at the bootup stage. To match resources with
VFs, the code needs to get each VF's BDF in advance.

This patch exports the interfaces to retrieve a VF's BDF:
* Make virtfn_bus an exported interface
* Make virtfn_devfn an exported interface
* Rename them with more specific names
* Clean up code in pci_sriov_resource_alignment()

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
drivers/pci/iov.c | 26 +++++++-------------------
drivers/pci/pci.h | 19 -------------------
include/linux/pci.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 51 insertions(+), 38 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index cb6f247..7566238 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -19,18 +19,6 @@

#define VIRTFN_ID_LEN 16

-static inline u8 virtfn_bus(struct pci_dev *dev, int id)
-{
- return dev->bus->number + ((dev->devfn + dev->sriov->offset +
- dev->sriov->stride * id) >> 8);
-}
-
-static inline u8 virtfn_devfn(struct pci_dev *dev, int id)
-{
- return (dev->devfn + dev->sriov->offset +
- dev->sriov->stride * id) & 0xff;
-}
-
static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
{
struct pci_bus *child;
@@ -69,7 +57,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
struct pci_bus *bus;

mutex_lock(&iov->dev->sriov->lock);
- bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id));
+ bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
if (!bus)
goto failed;

@@ -77,7 +65,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
if (!virtfn)
goto failed0;

- virtfn->devfn = virtfn_devfn(dev, id);
+ virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
virtfn->vendor = dev->vendor;
pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device);
pci_setup_device(virtfn);
@@ -140,8 +128,8 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
struct pci_sriov *iov = dev->sriov;

virtfn = pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus),
- virtfn_bus(dev, id),
- virtfn_devfn(dev, id));
+ pci_iov_virtfn_bus(dev, id),
+ pci_iov_virtfn_devfn(dev, id));
if (!virtfn)
return;

@@ -216,7 +204,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
iov->offset = offset;
iov->stride = stride;

- if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) {
+ if (pci_iov_virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) {
dev_err(&dev->dev, "SR-IOV: bus number out of range\n");
return -ENOMEM;
}
@@ -516,7 +504,7 @@ resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno)
if (!reg)
return 0;

- __pci_read_base(dev, type, &tmp, reg);
+ __pci_read_base(dev, type, &tmp, reg);
return resource_alignment(&tmp);
}

@@ -546,7 +534,7 @@ int pci_iov_bus_range(struct pci_bus *bus)
list_for_each_entry(dev, &bus->devices, bus_list) {
if (!dev->is_physfn)
continue;
- busnr = virtfn_bus(dev, dev->sriov->total_VFs - 1);
+ busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1);
if (busnr > max)
max = busnr;
}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 0601890..a3158b2 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -221,25 +221,6 @@ static inline int pci_ari_enabled(struct pci_bus *bus)
void pci_reassigndev_resource_alignment(struct pci_dev *dev);
void pci_disable_bridge_window(struct pci_dev *dev);

-/* Single Root I/O Virtualization */
-struct pci_sriov {
- int pos; /* capability position */
- int nres; /* number of resources */
- u32 cap; /* SR-IOV Capabilities */
- u16 ctrl; /* SR-IOV Control */
- u16 total_VFs; /* total VFs associated with the PF */
- u16 initial_VFs; /* initial VFs associated with the PF */
- u16 num_VFs; /* number of VFs available */
- u16 offset; /* first VF Routing ID offset */
- u16 stride; /* following VF stride */
- u32 pgsz; /* page size for BAR alignment */
- u8 link; /* Function Dependency Link */
- u16 driver_max_VFs; /* max num VFs driver supports */
- struct pci_dev *dev; /* lowest numbered PF */
- struct pci_dev *self; /* this PF */
- struct mutex lock; /* lock for VF bus */
-};
-
#ifdef CONFIG_PCI_ATS
void pci_restore_ats_state(struct pci_dev *dev);
#else
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 466bcd1..194db52 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -245,6 +245,27 @@ struct pci_vpd;
struct pci_sriov;
struct pci_ats;

+/* Single Root I/O Virtualization */
+struct pci_sriov {
+ int pos; /* capability position */
+ int nres; /* number of resources */
+ u32 cap; /* SR-IOV Capabilities */
+ u16 ctrl; /* SR-IOV Control */
+ u16 total_VFs; /* total VFs associated with the PF */
+ u16 initial_VFs; /* initial VFs associated with the PF */
+ u16 num_VFs; /* number of VFs available */
+ u16 offset; /* first VF Routing ID offset */
+ u16 stride; /* following VF stride */
+ u32 pgsz; /* page size for BAR alignment */
+ u8 link; /* Function Dependency Link */
+ u16 driver_max_VFs; /* max num VFs driver supports */
+ struct pci_dev *dev; /* lowest numbered PF */
+ struct pci_dev *self; /* this PF */
+ struct mutex lock; /* lock for VF bus */
+ struct work_struct mtask; /* VF Migration task */
+ u8 __iomem *mstate; /* VF Migration State Array */
+};
+
/*
* The pci_dev structure is used to describe PCI devices.
*/
@@ -1616,6 +1637,21 @@ int pci_ext_cfg_avail(void);
void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);

#ifdef CONFIG_PCI_IOV
+static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
+{
+ if (!dev->is_physfn)
+ return -EINVAL;
+ return dev->bus->number + ((dev->devfn + dev->sriov->offset +
+ dev->sriov->stride * id) >> 8);
+}
+static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
+{
+ if (!dev->is_physfn)
+ return -EINVAL;
+ return (dev->devfn + dev->sriov->offset +
+ dev->sriov->stride * id) & 0xff;
+}
+
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
void pci_disable_sriov(struct pci_dev *dev);
int pci_num_vf(struct pci_dev *dev);
@@ -1623,6 +1659,14 @@ int pci_vfs_assigned(struct pci_dev *dev);
int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
int pci_sriov_get_totalvfs(struct pci_dev *dev);
#else
+static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
+{
+ return -ENXIO;
+}
+static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
+{
+ return -ENXIO;
+}
static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{ return -ENODEV; }
static inline void pci_disable_sriov(struct pci_dev *dev) { }
--
1.7.9.5
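
A minimal usage sketch of the two exported helpers; the function below and
its reservation logic are illustrative assumptions, not part of this series:

#include <linux/pci.h>

/* Hypothetical platform code: compute each VF's BDF before the VFs
 * actually exist.  Both helpers return -EINVAL when 'pf' is not a
 * physical function. */
static void example_reserve_vf_resources(struct pci_dev *pf, u16 total_vfs)
{
	int id, bus, devfn;

	for (id = 0; id < total_vfs; id++) {
		bus = pci_iov_virtfn_bus(pf, id);
		devfn = pci_iov_virtfn_devfn(pf, id);
		if (bus < 0 || devfn < 0)
			return;	/* not a PF */

		/* platform-specific reservation keyed on (bus, devfn)
		 * would go here */
		dev_dbg(&pf->dev, "VF%d -> bus %02x devfn %02x\n",
			id, bus, devfn);
	}
}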
Wei Yang
2014-07-24 06:22:13 UTC
The SR-IOV resource alignment is designed to be the size of an individual VF
BAR. This works fine on many platforms, but on the powernv platform it needs
some change.

The original alignment works because, at the sizing and assigning stage, the
requirement comes from an individual VF's resource size instead of the whole
IOV BAR. This is why the original code just uses the individual VF BAR size
as the alignment.

On the powernv platform, the whole IOV BAR must be aligned to a hardware
aperture. Given this, the alignment of the SR-IOV resource should be the
total size of the IOV BAR.

This patch introduces a weak pcibios_sriov_resource_alignment() interface,
which gives platforms a chance to implement a specific method to calculate
the SR-IOV resource alignment.

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
drivers/pci/iov.c | 11 ++++++++++-
include/linux/pci.h | 3 +++
2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index ef1c546..d395769 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -505,6 +505,12 @@ int pci_iov_resource_bar(struct pci_dev *dev, int resno,
4 * (resno - PCI_IOV_RESOURCES);
}

+resource_size_t __weak pcibios_sriov_resource_alignment(struct pci_dev *dev,
+ int resno, resource_size_t align)
+{
+ return align;
+}
+
/**
* pci_sriov_resource_alignment - get resource alignment for VF BAR
* @dev: the PCI device
@@ -519,13 +525,16 @@ resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno)
{
struct resource tmp;
enum pci_bar_type type;
+ resource_size_t align;
int reg = pci_iov_resource_bar(dev, resno, &type);

if (!reg)
return 0;

__pci_read_base(dev, type, &tmp, reg);
- return resource_alignment(&tmp);
+ align = resource_alignment(&tmp);
+
+ return pcibios_sriov_resource_alignment(dev, resno, align);
}

/**
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 194db52..541ef4eb43 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1166,6 +1166,9 @@ unsigned char pci_bus_max_busnr(struct pci_bus *bus);
void pci_setup_bridge(struct pci_bus *bus);
resource_size_t pcibios_window_alignment(struct pci_bus *bus,
unsigned long type);
+resource_size_t pcibios_sriov_resource_alignment(struct pci_dev *dev,
+ int resno,
+ resource_size_t align);

#define PCI_VGA_STATE_CHANGE_BRIDGE (1 << 0)
#define PCI_VGA_STATE_CHANGE_DECODES (1 << 1)
--
1.7.9.5
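
To illustrate the hook, a platform override might look like the sketch below.
This is a simplified assumption (align the whole IOV BAR as the per-VF
alignment times total_VFs), not the actual powernv implementation:

#include <linux/pci.h>

/* Sketch of a platform-specific override of the weak hook.  'align'
 * is the per-VF alignment computed by the generic code; the whole
 * IOV BAR holds total_VFs copies of that VF BAR. */
resource_size_t pcibios_sriov_resource_alignment(struct pci_dev *dev,
						 int resno,
						 resource_size_t align)
{
	if (!dev->is_physfn)
		return align;

	return align * dev->sriov->total_VFs;
}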
Wei Yang
2014-07-24 06:22:14 UTC
At the resource sizing/assigning stage, resources are divided into two lists,
the requested list and the additional list, but the alignment of the additional
IOV BAR is not taken into account by the sizing and assigning procedure.

This is reasonable in the original implementation, since an IOV BAR's alignment
is usually the same as a PF BAR's alignment, which means the alignment is
already taken into consideration. However, this rule may be violated on some
platforms.

This patch takes the additional IOV BAR alignment into account explicitly in
the sizing and assigning stage.

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
drivers/pci/setup-bus.c | 68 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 60 insertions(+), 8 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index a5a63ec..d83681f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -120,6 +120,28 @@ static resource_size_t get_res_add_size(struct list_head *head,
return 0;
}

+static resource_size_t get_res_add_align(struct list_head *head,
+ struct resource *res)
+{
+ struct pci_dev_resource *dev_res;
+
+ list_for_each_entry(dev_res, head, list) {
+ if (dev_res->res == res) {
+ int idx = res - &dev_res->dev->resource[0];
+
+ dev_printk(KERN_DEBUG, &dev_res->dev->dev,
+ "res[%d]=%pR get_res_add_align min_align %llx\n",
+ idx, dev_res->res,
+ (unsigned long long)dev_res->min_align);
+
+ return dev_res->min_align;
+ }
+ }
+
+ return 0;
+}
+
+
/* Sort resources by alignment */
static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
{
@@ -368,8 +390,9 @@ static void __assign_resources_sorted(struct list_head *head,
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
- struct pci_dev_resource *dev_res, *tmp_res;
+ struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
unsigned long fail_type;
+ resource_size_t add_align, align;

/* Check if optional add_size is there */
if (!realloc_head || list_empty(realloc_head))
@@ -384,10 +407,31 @@ static void __assign_resources_sorted(struct list_head *head,
}

/* Update res in head list with add_size in realloc_head list */
- list_for_each_entry(dev_res, head, list)
+ list_for_each_entry_safe(dev_res, tmp_res, head, list) {
dev_res->res->end += get_res_add_size(realloc_head,
dev_res->res);

+ if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
+ continue;
+
+ add_align = get_res_add_align(realloc_head, dev_res->res);
+
+ if (add_align > dev_res->res->start) {
+ dev_res->res->start = add_align;
+ dev_res->res->end = add_align +
+ resource_size(dev_res->res);
+
+ list_for_each_entry(dev_res2, head, list) {
+ align = pci_resource_alignment(dev_res2->dev,
+ dev_res2->res);
+ if (add_align > align)
+ list_move_tail(&dev_res->list,
+ &dev_res2->list);
+ }
+ }
+
+ }
+
/* Try updated head list with add_size added */
assign_requested_resources_sorted(head, &local_fail_head);

@@ -930,6 +974,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
struct resource *b_res = find_free_bus_resource(bus,
mask | IORESOURCE_PREFETCH, type);
resource_size_t children_add_size = 0;
+ resource_size_t children_add_align = 0;
+ resource_size_t add_align = 0;

if (!b_res)
return -ENOSPC;
@@ -954,6 +1000,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
/* put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
i <= PCI_IOV_RESOURCE_END) {
+ add_align = max(pci_resource_alignment(dev, r), add_align);
r->end = r->start - 1;
add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
children_add_size += r_size;
@@ -984,8 +1031,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
if (order > max_order)
max_order = order;

- if (realloc_head)
+ if (realloc_head) {
children_add_size += get_res_add_size(realloc_head, r);
+ children_add_align = get_res_add_align(realloc_head, r);
+ add_align = max(add_align, children_add_align);
+ }
}
}

@@ -996,7 +1046,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
add_size = children_add_size;
size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
calculate_memsize(size, min_size, add_size,
- resource_size(b_res), min_align);
+ resource_size(b_res), max(min_align, add_align));
if (!size0 && !size1) {
if (b_res->start || b_res->end)
dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
@@ -1008,10 +1058,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
b_res->end = size0 + min_align - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
- add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
- dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
- b_res, &bus->busn_res,
- (unsigned long long)size1-size0);
+ add_to_list(realloc_head, bus->self, b_res, size1-size0,
+ max(min_align, add_align));
+ dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window "
+ "%pR to %pR add_size %llx add_align %llx\n", b_res,
+ &bus->busn_res, (unsigned long long)size1-size0,
+ max(min_align, add_align));
}
return 0;
}
--
1.7.9.5
Bjorn Helgaas
2014-08-20 03:08:41 UTC
Post by Wei Yang
At the resource sizing/assigning stage, resources are divided into two lists,
the requested list and the additional list, but the alignment of the additional
IOV BAR is not taken into account by the sizing and assigning procedure.
This is reasonable in the original implementation, since an IOV BAR's alignment
is usually the same as a PF BAR's alignment, which means the alignment is
already taken into consideration. However, this rule may be violated on some
platforms.
This patch takes the additional IOV BAR alignment into account explicitly in
the sizing and assigning stage.
---
drivers/pci/setup-bus.c | 68 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index a5a63ec..d83681f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -120,6 +120,28 @@ static resource_size_t get_res_add_size(struct list_head *head,
return 0;
}
+static resource_size_t get_res_add_align(struct list_head *head,
+ struct resource *res)
+{
+ struct pci_dev_resource *dev_res;
+
+ list_for_each_entry(dev_res, head, list) {
+ if (dev_res->res == res) {
+ int idx = res - &dev_res->dev->resource[0];
+
+ dev_printk(KERN_DEBUG, &dev_res->dev->dev,
+ "res[%d]=%pR get_res_add_align min_align %llx\n",
+ idx, dev_res->res,
+ (unsigned long long)dev_res->min_align);
+
+ return dev_res->min_align;
+ }
+ }
+
+ return 0;
+}
I see that you copied the structure of the existing get_res_add_size()
here. But I don't understand *that* function. It looks basically like
this:

resource_size_t get_res_add_size(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res->add_size;
}
return 0;
}

and we call it like this:

dev_res->res->end += get_res_add_size(realloc_head, dev_res->res);

So we start out with dev_res, pass in dev_res->res, search the
realloc_head list to find dev_res again, and return dev_res->add_size.
That looks equivalent to just:

dev_res->res->end += dev_res->add_size;

It looks like get_res_add_size() merely adds a printk and some complexity.
Am I missing something?

I do see that there are other callers where we don't actually start with
dev_res, which makes it a little more complicated. But I think you should
either add something like this:

struct pci_dev_resource *res_to_dev_res(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res;
}
return NULL;
}

which can be used to replace get_res_add_size() and get_res_add_align(), OR
figure out whether the dev_res of interest is always one we've just added.
If it is, maybe you can just make add_to_list() return the dev_res pointer
instead of an errno, and hang onto the pointer. I'd like that much better
if that's possible.
Post by Wei Yang
+
+
/* Sort resources by alignment */
static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
{
@@ -368,8 +390,9 @@ static void __assign_resources_sorted(struct list_head *head,
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
- struct pci_dev_resource *dev_res, *tmp_res;
+ struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
unsigned long fail_type;
+ resource_size_t add_align, align;
/* Check if optional add_size is there */
if (!realloc_head || list_empty(realloc_head))
@@ -384,10 +407,31 @@ static void __assign_resources_sorted(struct list_head *head,
}
/* Update res in head list with add_size in realloc_head list */
- list_for_each_entry(dev_res, head, list)
+ list_for_each_entry_safe(dev_res, tmp_res, head, list) {
dev_res->res->end += get_res_add_size(realloc_head,
dev_res->res);
+ if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
+ continue;
+
+ add_align = get_res_add_align(realloc_head, dev_res->res);
+
+ if (add_align > dev_res->res->start) {
+ dev_res->res->start = add_align;
+ dev_res->res->end = add_align +
+ resource_size(dev_res->res);
+
+ list_for_each_entry(dev_res2, head, list) {
+ align = pci_resource_alignment(dev_res2->dev,
+ dev_res2->res);
+ if (add_align > align)
+ list_move_tail(&dev_res->list,
+ &dev_res2->list);
+ }
+ }
+
+ }
+
/* Try updated head list with add_size added */
assign_requested_resources_sorted(head, &local_fail_head);
@@ -930,6 +974,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
struct resource *b_res = find_free_bus_resource(bus,
mask | IORESOURCE_PREFETCH, type);
resource_size_t children_add_size = 0;
+ resource_size_t children_add_align = 0;
+ resource_size_t add_align = 0;
if (!b_res)
return -ENOSPC;
@@ -954,6 +1000,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
/* put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
i <= PCI_IOV_RESOURCE_END) {
+ add_align = max(pci_resource_alignment(dev, r), add_align);
r->end = r->start - 1;
add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
children_add_size += r_size;
@@ -984,8 +1031,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
if (order > max_order)
max_order = order;
- if (realloc_head)
+ if (realloc_head) {
children_add_size += get_res_add_size(realloc_head, r);
+ children_add_align = get_res_add_align(realloc_head, r);
+ add_align = max(add_align, children_add_align);
+ }
}
}
@@ -996,7 +1046,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
add_size = children_add_size;
calculate_memsize(size, min_size, add_size,
- resource_size(b_res), min_align);
+ resource_size(b_res), max(min_align, add_align));
if (!size0 && !size1) {
if (b_res->start || b_res->end)
dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
@@ -1008,10 +1058,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
b_res->end = size0 + min_align - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
- add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
- dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
- b_res, &bus->busn_res,
- (unsigned long long)size1-size0);
+ add_to_list(realloc_head, bus->self, b_res, size1-size0,
+ max(min_align, add_align));
+ dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window "
+ "%pR to %pR add_size %llx add_align %llx\n", b_res,
+ &bus->busn_res, (unsigned long long)size1-size0,
+ max(min_align, add_align));
Factor out this "max(min_align, add_align)" thing so we don't have to
change these lines. Bonus points if you can also factor it out of the
calculate_memsize() call above. That one is a pretty complicated ternary
expression that should probably be turned into an "if" instead anyway.
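
One possible shape for that factoring (a sketch only; 'win_align' is an
illustrative name):

	resource_size_t win_align = max(min_align, add_align);

	if (!realloc_head || !add_size)
		size1 = size0;
	else
		size1 = calculate_memsize(size, min_size, add_size,
					  resource_size(b_res), win_align);

win_align could then also replace the repeated max(min_align, add_align) in
the add_to_list() and dev_printk() calls below it.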
Post by Wei Yang
}
return 0;
}
--
1.7.9.5
Wei Yang
2014-08-20 06:14:02 UTC
Post by Bjorn Helgaas
Post by Wei Yang
At the resource sizing/assigning stage, resources are divided into two lists,
the requested list and the additional list, but the alignment of the additional
IOV BAR is not taken into account by the sizing and assigning procedure.
This is reasonable in the original implementation, since an IOV BAR's alignment
is usually the same as a PF BAR's alignment, which means the alignment is
already taken into consideration. However, this rule may be violated on some
platforms.
This patch takes the additional IOV BAR alignment into account explicitly in
the sizing and assigning stage.
---
drivers/pci/setup-bus.c | 68 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index a5a63ec..d83681f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -120,6 +120,28 @@ static resource_size_t get_res_add_size(struct list_head *head,
return 0;
}
+static resource_size_t get_res_add_align(struct list_head *head,
+ struct resource *res)
+{
+ struct pci_dev_resource *dev_res;
+
+ list_for_each_entry(dev_res, head, list) {
+ if (dev_res->res == res) {
+ int idx = res - &dev_res->dev->resource[0];
+
+ dev_printk(KERN_DEBUG, &dev_res->dev->dev,
+ "res[%d]=%pR get_res_add_align min_align %llx\n",
+ idx, dev_res->res,
+ (unsigned long long)dev_res->min_align);
+
+ return dev_res->min_align;
+ }
+ }
+
+ return 0;
+}
I see that you copied the structure of the existing get_res_add_size()
here. But I don't understand *that* function. It looks basically like
resource_size_t get_res_add_size(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res->add_size;
}
return 0;
}
dev_res->res->end += get_res_add_size(realloc_head, dev_res->res);
So we start out with dev_res, pass in dev_res->res, search the
realloc_head list to find dev_res again, and return dev_res->add_size.
That looks equivalent to just:
dev_res->res->end += dev_res->add_size;
It looks like get_res_add_size() merely adds a printk and some complexity.
Am I missing something?
Let me try to explain it; if this is not correct, please let me know :-)

dev_res->res->end += get_res_add_size(realloc_head, dev_res->res);

would be expanded to:

dev_res->res->end += dev_res_1->add_size;

where dev_res_1 is a different entry from dev_res, stored in realloc_head.
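
In other words, the same struct resource is wrapped by two different
pci_dev_resource entries, one on each list, and only the realloc_head entry
carries add_size. A sketch of the lookup (names are illustrative):

/* Find the realloc_head entry wrapping the same resource as a
 * head-list entry, then apply its add_size. */
static void example_apply_add_size(struct list_head *realloc_head,
				   struct pci_dev_resource *on_head)
{
	struct pci_dev_resource *on_realloc;

	list_for_each_entry(on_realloc, realloc_head, list) {
		if (on_realloc->res == on_head->res) {
			on_head->res->end += on_realloc->add_size;
			return;
		}
	}
}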
Post by Bjorn Helgaas
I do see that there are other callers where we don't actually start with
dev_res, which makes it a little more complicated. But I think you should
struct pci_dev_resource *res_to_dev_res(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res;
}
return NULL;
}
Ok, we can extract the common part of these two functions.
Post by Bjorn Helgaas
which can be used to replace get_res_add_size() and get_res_add_align(), OR
figure out whether the dev_res of interest is always one we've just added.
If it is, maybe you can just make add_to_list() return the dev_res pointer
instead of an errno, and hang onto the pointer. I'd like that much better
if that's possible.
Sorry, I don't get this point.

add_to_list() is used to create the pci_dev_resource list, while
get_res_add_size() and get_res_add_align() retrieve information from the
list. I am not sure how to leverage add_to_list() in these two functions.
Post by Bjorn Helgaas
Post by Wei Yang
+
+
/* Sort resources by alignment */
static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
{
@@ -368,8 +390,9 @@ static void __assign_resources_sorted(struct list_head *head,
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
- struct pci_dev_resource *dev_res, *tmp_res;
+ struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
unsigned long fail_type;
+ resource_size_t add_align, align;
/* Check if optional add_size is there */
if (!realloc_head || list_empty(realloc_head))
@@ -384,10 +407,31 @@ static void __assign_resources_sorted(struct list_head *head,
}
/* Update res in head list with add_size in realloc_head list */
- list_for_each_entry(dev_res, head, list)
+ list_for_each_entry_safe(dev_res, tmp_res, head, list) {
dev_res->res->end += get_res_add_size(realloc_head,
dev_res->res);
+ if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
+ continue;
+
+ add_align = get_res_add_align(realloc_head, dev_res->res);
+
+ if (add_align > dev_res->res->start) {
+ dev_res->res->start = add_align;
+ dev_res->res->end = add_align +
+ resource_size(dev_res->res);
+
+ list_for_each_entry(dev_res2, head, list) {
+ align = pci_resource_alignment(dev_res2->dev,
+ dev_res2->res);
+ if (add_align > align)
+ list_move_tail(&dev_res->list,
+ &dev_res2->list);
+ }
+ }
+
+ }
+
/* Try updated head list with add_size added */
assign_requested_resources_sorted(head, &local_fail_head);
@@ -930,6 +974,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
struct resource *b_res = find_free_bus_resource(bus,
mask | IORESOURCE_PREFETCH, type);
resource_size_t children_add_size = 0;
+ resource_size_t children_add_align = 0;
+ resource_size_t add_align = 0;
if (!b_res)
return -ENOSPC;
@@ -954,6 +1000,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
/* put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
i <= PCI_IOV_RESOURCE_END) {
+ add_align = max(pci_resource_alignment(dev, r), add_align);
r->end = r->start - 1;
add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
children_add_size += r_size;
@@ -984,8 +1031,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
if (order > max_order)
max_order = order;
- if (realloc_head)
+ if (realloc_head) {
children_add_size += get_res_add_size(realloc_head, r);
+ children_add_align = get_res_add_align(realloc_head, r);
+ add_align = max(add_align, children_add_align);
+ }
}
}
@@ -996,7 +1046,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
add_size = children_add_size;
calculate_memsize(size, min_size, add_size,
- resource_size(b_res), min_align);
+ resource_size(b_res), max(min_align, add_align));
if (!size0 && !size1) {
if (b_res->start || b_res->end)
dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
@@ -1008,10 +1058,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
b_res->end = size0 + min_align - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
- add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
- dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
- b_res, &bus->busn_res,
- (unsigned long long)size1-size0);
+ add_to_list(realloc_head, bus->self, b_res, size1-size0,
+ max(min_align, add_align));
+ dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window "
+ "%pR to %pR add_size %llx add_align %llx\n", b_res,
+ &bus->busn_res, (unsigned long long)size1-size0,
+ max(min_align, add_align));
Factor out this "max(min_align, add_align)" thing so we don't have to
change these lines. Bonus points if you can also factor it out of the
calculate_memsize() call above. That one is a pretty complicated ternary
expression that should probably be turned into an "if" instead anyway.
Ok, I get your point. Let me make it easier to read.
Post by Bjorn Helgaas
Post by Wei Yang
}
return 0;
}
--
1.7.9.5
--
Richard Yang
Help you, Help me
Wei Yang
2014-08-28 02:34:02 UTC
Bjorn,

Is my understanding correct? Could I send another version based on your
comments, so that we can see whether it meets your requirements?
Post by Wei Yang
Post by Bjorn Helgaas
Post by Wei Yang
At the resource sizing/assigning stage, resources are divided into two lists,
the requested list and the additional list, but the alignment of the additional
IOV BAR is not taken into account by the sizing and assigning procedure.
This is reasonable in the original implementation, since an IOV BAR's alignment
is usually the same as a PF BAR's alignment, which means the alignment is
already taken into consideration. However, this rule may be violated on some
platforms.
This patch takes the additional IOV BAR alignment into account explicitly in
the sizing and assigning stage.
---
drivers/pci/setup-bus.c | 68 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index a5a63ec..d83681f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -120,6 +120,28 @@ static resource_size_t get_res_add_size(struct list_head *head,
return 0;
}
+static resource_size_t get_res_add_align(struct list_head *head,
+ struct resource *res)
+{
+ struct pci_dev_resource *dev_res;
+
+ list_for_each_entry(dev_res, head, list) {
+ if (dev_res->res == res) {
+ int idx = res - &dev_res->dev->resource[0];
+
+ dev_printk(KERN_DEBUG, &dev_res->dev->dev,
+ "res[%d]=%pR get_res_add_align min_align %llx\n",
+ idx, dev_res->res,
+ (unsigned long long)dev_res->min_align);
+
+ return dev_res->min_align;
+ }
+ }
+
+ return 0;
+}
I see that you copied the structure of the existing get_res_add_size()
here. But I don't understand *that* function. It looks basically like
resource_size_t get_res_add_size(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res->add_size;
}
return 0;
}
dev_res->res->end += get_res_add_size(realloc_head, dev_res->res);
So we start out with dev_res, pass in dev_res->res, search the
realloc_head list to find dev_res again, and return dev_res->add_size.
That looks equivalent to just:
dev_res->res->end += dev_res->add_size;
It looks like get_res_add_size() merely adds a printk and some complexity.
Am I missing something?
Let me try to explain it; if this is not correct, please let me know :-)
dev_res->res->end += get_res_add_size(realloc_head, dev_res->res);
would be expanded to:
dev_res->res->end += dev_res_1->add_size;
where dev_res_1 is a different entry from dev_res, stored in realloc_head.
Post by Bjorn Helgaas
I do see that there are other callers where we don't actually start with
dev_res, which makes it a little more complicated. But I think you should
struct pci_dev_resource *res_to_dev_res(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res;
}
return NULL;
}
Ok, we can extract the common part of these two functions.
Post by Bjorn Helgaas
which can be used to replace get_res_add_size() and get_res_add_align(), OR
figure out whether the dev_res of interest is always one we've just added.
If it is, maybe you can just make add_to_list() return the dev_res pointer
instead of an errno, and hang onto the pointer. I'd like that much better
if that's possible.
Sorry, I don't get this point.
add_to_list() is used to create the pci_dev_resource list, while
get_res_add_size() and get_res_add_align() retrieve information from the
list. I am not sure how to leverage add_to_list() in these two functions.
Post by Bjorn Helgaas
Post by Wei Yang
+
+
/* Sort resources by alignment */
static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
{
@@ -368,8 +390,9 @@ static void __assign_resources_sorted(struct list_head *head,
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
- struct pci_dev_resource *dev_res, *tmp_res;
+ struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
unsigned long fail_type;
+ resource_size_t add_align, align;
/* Check if optional add_size is there */
if (!realloc_head || list_empty(realloc_head))
@@ -384,10 +407,31 @@ static void __assign_resources_sorted(struct list_head *head,
}
/* Update res in head list with add_size in realloc_head list */
- list_for_each_entry(dev_res, head, list)
+ list_for_each_entry_safe(dev_res, tmp_res, head, list) {
dev_res->res->end += get_res_add_size(realloc_head,
dev_res->res);
+ if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
+ continue;
+
+ add_align = get_res_add_align(realloc_head, dev_res->res);
+
+ if (add_align > dev_res->res->start) {
+ dev_res->res->start = add_align;
+ dev_res->res->end = add_align +
+ resource_size(dev_res->res);
+
+ list_for_each_entry(dev_res2, head, list) {
+ align = pci_resource_alignment(dev_res2->dev,
+ dev_res2->res);
+ if (add_align > align)
+ list_move_tail(&dev_res->list,
+ &dev_res2->list);
+ }
+ }
+
+ }
+
/* Try updated head list with add_size added */
assign_requested_resources_sorted(head, &local_fail_head);
@@ -930,6 +974,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
struct resource *b_res = find_free_bus_resource(bus,
mask | IORESOURCE_PREFETCH, type);
resource_size_t children_add_size = 0;
+ resource_size_t children_add_align = 0;
+ resource_size_t add_align = 0;
if (!b_res)
return -ENOSPC;
@@ -954,6 +1000,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
/* put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
i <= PCI_IOV_RESOURCE_END) {
+ add_align = max(pci_resource_alignment(dev, r), add_align);
r->end = r->start - 1;
add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
children_add_size += r_size;
@@ -984,8 +1031,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
if (order > max_order)
max_order = order;
- if (realloc_head)
+ if (realloc_head) {
children_add_size += get_res_add_size(realloc_head, r);
+ children_add_align = get_res_add_align(realloc_head, r);
+ add_align = max(add_align, children_add_align);
+ }
}
}
@@ -996,7 +1046,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
add_size = children_add_size;
calculate_memsize(size, min_size, add_size,
- resource_size(b_res), min_align);
+ resource_size(b_res), max(min_align, add_align));
if (!size0 && !size1) {
if (b_res->start || b_res->end)
dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
@@ -1008,10 +1058,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
b_res->end = size0 + min_align - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
- add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
- dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
- b_res, &bus->busn_res,
- (unsigned long long)size1-size0);
+ add_to_list(realloc_head, bus->self, b_res, size1-size0,
+ max(min_align, add_align));
+ dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window "
+ "%pR to %pR add_size %llx add_align %llx\n", b_res,
+ &bus->busn_res, (unsigned long long)size1-size0,
+ max(min_align, add_align));
Factor out this "max(min_align, add_align)" thing so we don't have to
change these lines. Bonus points if you can also factor it out of the
calculate_memsize() call above. That one is a pretty complicated ternary
expression that should probably be turned into an "if" instead anyway.
Ok, I get your point. Let me make it easier to read.
Post by Bjorn Helgaas
Post by Wei Yang
}
return 0;
}
--
1.7.9.5
--
Richard Yang
Help you, Help me
--
Richard Yang
Help you, Help me
Bjorn Helgaas
2014-09-09 20:09:46 UTC
Post by Wei Yang
Post by Bjorn Helgaas
Post by Wei Yang
At the resource sizing/assigning stage, resources are divided into two lists,
the requested list and the additional list, but the alignment of the additional
IOV BAR is not taken into account by the sizing and assigning procedure.
This is reasonable in the original implementation, since an IOV BAR's alignment
is usually the same as a PF BAR's alignment, which means the alignment is
already taken into consideration. However, this rule may be violated on some
platforms.
This patch takes the additional IOV BAR alignment into account explicitly in
the sizing and assigning stage.
---
drivers/pci/setup-bus.c | 68 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index a5a63ec..d83681f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -120,6 +120,28 @@ static resource_size_t get_res_add_size(struct list_head *head,
return 0;
}
+static resource_size_t get_res_add_align(struct list_head *head,
+ struct resource *res)
+{
+ struct pci_dev_resource *dev_res;
+
+ list_for_each_entry(dev_res, head, list) {
+ if (dev_res->res == res) {
+ int idx = res - &dev_res->dev->resource[0];
+
+ dev_printk(KERN_DEBUG, &dev_res->dev->dev,
+ "res[%d]=%pR get_res_add_align min_align %llx\n",
+ idx, dev_res->res,
+ (unsigned long long)dev_res->min_align);
+
+ return dev_res->min_align;
+ }
+ }
+
+ return 0;
+}
I see that you copied the structure of the existing get_res_add_size()
here. But I don't understand *that* function. It looks basically like
resource_size_t get_res_add_size(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res->add_size;
}
return 0;
}
dev_res->res->end += get_res_add_size(realloc_head, dev_res->res);
So we start out with dev_res, pass in dev_res->res, search the
realloc_head list to find dev_res again, and return dev_res->add_size.
That looks equivalent to just:
dev_res->res->end += dev_res->add_size;
It looks like get_res_add_size() merely adds a printk and some complexity.
Am I missing something?
Let me try to explain it; if this is not correct, please let me know :-)
dev_res->res->end += get_res_add_size(realloc_head, dev_res->res);
would be expanded to:
dev_res->res->end += dev_res_1->add_size;
where dev_res_1 is a different entry from dev_res, stored in realloc_head.
Yep, I see now.
Post by Wei Yang
Post by Bjorn Helgaas
I do see that there are other callers where we don't actually start with
dev_res, which makes it a little more complicated. But I think you should
struct pci_dev_resource *res_to_dev_res(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res;
}
return NULL;
}
Ok, we can extract the common part of these two functions.
Post by Bjorn Helgaas
which can be used to replace get_res_add_size() and get_res_add_align(), OR
figure out whether the dev_res of interest is always one we've just added.
If it is, maybe you can just make add_to_list() return the dev_res pointer
instead of an errno, and hang onto the pointer. I'd like that much better
if that's possible.
Sorry, I don't get this point.
Don't worry, it didn't make sense. I was thinking that we knew the
dev_res up front and didn't need to look it up, but that's not the
case.

Sorry it took me so long to respond to this; I'm a bit swamped dealing
with some regressions.

Bjorn
Post by Wei Yang
add_to_list() is used to create the pci_dev_resource list, while
get_res_add_size() and get_res_add_align() retrieve information from the
list. I am not sure how to leverage add_to_list() in these two functions.
Post by Bjorn Helgaas
Post by Wei Yang
+
+
/* Sort resources by alignment */
static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
{
@@ -368,8 +390,9 @@ static void __assign_resources_sorted(struct list_head *head,
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
- struct pci_dev_resource *dev_res, *tmp_res;
+ struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
unsigned long fail_type;
+ resource_size_t add_align, align;
/* Check if optional add_size is there */
if (!realloc_head || list_empty(realloc_head))
@@ -384,10 +407,31 @@ static void __assign_resources_sorted(struct list_head *head,
}
/* Update res in head list with add_size in realloc_head list */
- list_for_each_entry(dev_res, head, list)
+ list_for_each_entry_safe(dev_res, tmp_res, head, list) {
dev_res->res->end += get_res_add_size(realloc_head,
dev_res->res);
+ if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
+ continue;
+
+ add_align = get_res_add_align(realloc_head, dev_res->res);
+
+ if (add_align > dev_res->res->start) {
+ dev_res->res->start = add_align;
+ dev_res->res->end = add_align +
+ resource_size(dev_res->res);
+
+ list_for_each_entry(dev_res2, head, list) {
+ align = pci_resource_alignment(dev_res2->dev,
+ dev_res2->res);
+ if (add_align > align)
+ list_move_tail(&dev_res->list,
+ &dev_res2->list);
+ }
+ }
+
+ }
+
/* Try updated head list with add_size added */
assign_requested_resources_sorted(head, &local_fail_head);
@@ -930,6 +974,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
struct resource *b_res = find_free_bus_resource(bus,
mask | IORESOURCE_PREFETCH, type);
resource_size_t children_add_size = 0;
+ resource_size_t children_add_align = 0;
+ resource_size_t add_align = 0;
if (!b_res)
return -ENOSPC;
@@ -954,6 +1000,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
/* put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
i <= PCI_IOV_RESOURCE_END) {
+ add_align = max(pci_resource_alignment(dev, r), add_align);
r->end = r->start - 1;
add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
children_add_size += r_size;
@@ -984,8 +1031,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
if (order > max_order)
max_order = order;
- if (realloc_head)
+ if (realloc_head) {
children_add_size += get_res_add_size(realloc_head, r);
+ children_add_align = get_res_add_align(realloc_head, r);
+ add_align = max(add_align, children_add_align);
+ }
}
}
@@ -996,7 +1046,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
add_size = children_add_size;
calculate_memsize(size, min_size, add_size,
- resource_size(b_res), min_align);
+ resource_size(b_res), max(min_align, add_align));
if (!size0 && !size1) {
if (b_res->start || b_res->end)
dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
@@ -1008,10 +1058,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
b_res->end = size0 + min_align - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
- add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
- dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
- b_res, &bus->busn_res,
- (unsigned long long)size1-size0);
+ add_to_list(realloc_head, bus->self, b_res, size1-size0,
+ max(min_align, add_align));
+ dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window "
+ "%pR to %pR add_size %llx add_align %llx\n", b_res,
+ &bus->busn_res, (unsigned long long)size1-size0,
+ max(min_align, add_align));
Factor out this "max(min_align, add_align)" thing so we don't have to
change these lines. Bonus points if you can also factor it out of the
calculate_memsize() call above. That one is a pretty complicated ternary
expression that should probably be turned into an "if" instead anyway.
Ok, I get your point. Let me make it easier to read.
Post by Bjorn Helgaas
Post by Wei Yang
}
return 0;
}
--
1.7.9.5
--
Richard Yang
Help you, Help me
Wei Yang
2014-09-10 03:27:43 UTC
Post by Bjorn Helgaas
Post by Wei Yang
Post by Bjorn Helgaas
Post by Wei Yang
At the resource sizing/assigning stage, resources are divided into two lists,
the requested list and the additional list, but the alignment of the additional
IOV BAR is not taken into account by the sizing and assigning procedure.
This is reasonable in the original implementation, since an IOV BAR's alignment
is usually the same as a PF BAR's alignment, which means the alignment is
already taken into consideration. However, this rule may be violated on some
platforms.
This patch takes the additional IOV BAR alignment into account explicitly in
the sizing and assigning stage.
---
drivers/pci/setup-bus.c | 68 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 60 insertions(+), 8 deletions(-)
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index a5a63ec..d83681f 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -120,6 +120,28 @@ static resource_size_t get_res_add_size(struct list_head *head,
return 0;
}
+static resource_size_t get_res_add_align(struct list_head *head,
+ struct resource *res)
+{
+ struct pci_dev_resource *dev_res;
+
+ list_for_each_entry(dev_res, head, list) {
+ if (dev_res->res == res) {
+ int idx = res - &dev_res->dev->resource[0];
+
+ dev_printk(KERN_DEBUG, &dev_res->dev->dev,
+ "res[%d]=%pR get_res_add_align min_align %llx\n",
+ idx, dev_res->res,
+ (unsigned long long)dev_res->min_align);
+
+ return dev_res->min_align;
+ }
+ }
+
+ return 0;
+}
I see that you copied the structure of the existing get_res_add_size()
here. But I don't understand *that* function. It looks basically like
resource_size_t get_res_add_size(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res->add_size;
}
return 0;
}
dev_res->res->end += get_res_add_size(realloc_head, dev_res->res);
So we start out with dev_res, pass in dev_res->res, search the
realloc_head list to find dev_res again, and return dev_res->add_size.
That looks equivalent to just:
dev_res->res->end += dev_res->add_size;
It looks like get_res_add_size() merely adds a printk and some complexity.
Am I missing something?
Let me try to explain it; if this is not correct, please let me know :-)
dev_res->res->end += get_res_add_size(realloc_head, dev_res->res);
would be expanded to:
dev_res->res->end += dev_res_1->add_size;
where dev_res_1 is a different entry from dev_res, stored in realloc_head.
Yep, I see now.
Post by Wei Yang
Post by Bjorn Helgaas
I do see that there are other callers where we don't actually start with
dev_res, which makes it a little more complicated. But I think you should
struct pci_dev_resource *res_to_dev_res(list, res)
{
list_for_each_entry(dev_res, head, list) {
if (dev_res->res == res)
return dev_res;
}
return NULL;
}
Ok, we can extract the common part of these two functions.
Post by Bjorn Helgaas
which can be used to replace get_res_add_size() and get_res_add_align(), OR
figure out whether the dev_res of interest is always one we've just added.
If it is, maybe you can just make add_to_list() return the dev_res pointer
instead of an errno, and hang onto the pointer. I'd like that much better
if that's possible.
Sorry, I don't get this point.
Don't worry, it didn't make sense. I was thinking that we knew the
dev_res up front and didn't need to look it up, but that's not the
case.
Sorry it took me so long to respond to this; I'm a bit swamped dealing
with some regressions.
:-) Never mind, those regressions have higher priority than this new
feature.

And I found some bugs in this version during testing, and will merge those
fixes into the next version.
Post by Bjorn Helgaas
Bjorn
Post by Wei Yang
add_to_list() is used to create the pci_dev_resource list, while
get_res_add_size() and get_res_add_align() retrieve information from the
list. I am not sure how to leverage add_to_list() in these two functions.
Post by Bjorn Helgaas
Post by Wei Yang
+
+
/* Sort resources by alignment */
static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head)
{
@@ -368,8 +390,9 @@ static void __assign_resources_sorted(struct list_head *head,
LIST_HEAD(save_head);
LIST_HEAD(local_fail_head);
struct pci_dev_resource *save_res;
- struct pci_dev_resource *dev_res, *tmp_res;
+ struct pci_dev_resource *dev_res, *tmp_res, *dev_res2;
unsigned long fail_type;
+ resource_size_t add_align, align;
/* Check if optional add_size is there */
if (!realloc_head || list_empty(realloc_head))
@@ -384,10 +407,31 @@ static void __assign_resources_sorted(struct list_head *head,
}
/* Update res in head list with add_size in realloc_head list */
- list_for_each_entry(dev_res, head, list)
+ list_for_each_entry_safe(dev_res, tmp_res, head, list) {
dev_res->res->end += get_res_add_size(realloc_head,
dev_res->res);
+ if (!(dev_res->res->flags & IORESOURCE_STARTALIGN))
+ continue;
+
+ add_align = get_res_add_align(realloc_head, dev_res->res);
+
+ if (add_align > dev_res->res->start) {
+ dev_res->res->start = add_align;
+ dev_res->res->end = add_align +
+ resource_size(dev_res->res);
+
+ list_for_each_entry(dev_res2, head, list) {
+ align = pci_resource_alignment(dev_res2->dev,
+ dev_res2->res);
+ if (add_align > align)
+ list_move_tail(&dev_res->list,
+ &dev_res2->list);
+ }
+ }
+
+ }
+
/* Try updated head list with add_size added */
assign_requested_resources_sorted(head, &local_fail_head);
@@ -930,6 +974,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
struct resource *b_res = find_free_bus_resource(bus,
mask | IORESOURCE_PREFETCH, type);
resource_size_t children_add_size = 0;
+ resource_size_t children_add_align = 0;
+ resource_size_t add_align = 0;
if (!b_res)
return -ENOSPC;
@@ -954,6 +1000,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
/* put SRIOV requested res to the optional list */
if (realloc_head && i >= PCI_IOV_RESOURCES &&
i <= PCI_IOV_RESOURCE_END) {
+ add_align = max(pci_resource_alignment(dev, r), add_align);
r->end = r->start - 1;
add_to_list(realloc_head, dev, r, r_size, 0/* don't care */);
children_add_size += r_size;
@@ -984,8 +1031,11 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
if (order > max_order)
max_order = order;
- if (realloc_head)
+ if (realloc_head) {
children_add_size += get_res_add_size(realloc_head, r);
+ children_add_align = get_res_add_align(realloc_head, r);
+ add_align = max(add_align, children_add_align);
+ }
}
}
@@ -996,7 +1046,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
add_size = children_add_size;
calculate_memsize(size, min_size, add_size,
- resource_size(b_res), min_align);
+ resource_size(b_res), max(min_align, add_align));
if (!size0 && !size1) {
if (b_res->start || b_res->end)
dev_info(&bus->self->dev, "disabling bridge window %pR to %pR (unused)\n",
@@ -1008,10 +1058,12 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
b_res->end = size0 + min_align - 1;
b_res->flags |= IORESOURCE_STARTALIGN;
if (size1 > size0 && realloc_head) {
- add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
- dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window %pR to %pR add_size %llx\n",
- b_res, &bus->busn_res,
- (unsigned long long)size1-size0);
+ add_to_list(realloc_head, bus->self, b_res, size1-size0,
+ max(min_align, add_align));
+ dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window "
+ "%pR to %pR add_size %llx add_align %llx\n", b_res,
+ &bus->busn_res, (unsigned long long)size1-size0,
+ max(min_align, add_align));
Factor out this "max(min_align, add_align)" thing so we don't have to
change these lines. Bonus points if you can also factor it out of the
calculate_memsize() call above. That one is a pretty complicated ternary
expression that should probably be turned into an "if" instead anyway.
Ok, I get your point. Let me make it easier to read.
Post by Bjorn Helgaas
Post by Wei Yang
}
return 0;
}
--
1.7.9.5
--
Richard Yang
Help you, Help me
--
Richard Yang
Help you, Help me
Wei Yang
2014-07-24 06:22:16 UTC
When a driver disables a pci_dev, pci_disable_device() will call
pcibios_disable_device(), which is platform dependent. This gives flexibility
to platforms.

This patch defines this weak function on the powerpc architecture.

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/machdep.h | 5 ++++-
arch/powerpc/kernel/pci-common.c | 8 ++++++++
2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index f92b0b5..3909d1b 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -238,7 +238,10 @@ struct machdep_calls {

/* Called when pci_enable_device() is called. Returns 0 to
* allow assignment/enabling of the device. */
- int (*pcibios_enable_device_hook)(struct pci_dev *);
+ int (*pcibios_enable_device_hook)(struct pci_dev *);
+
+ /* Called when pci_disable_device() is called. */
+ void (*pcibios_disable_device_hook)(struct pci_dev *);

/* Called after scan and before resource survey */
void (*pcibios_fixup_phb)(struct pci_controller *hose);
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index ee70f57..d38a330 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1457,6 +1457,14 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
return pci_enable_resources(dev, mask);
}

+void pcibios_disable_device(struct pci_dev *dev)
+{
+ if (ppc_md.pcibios_disable_device_hook)
+ ppc_md.pcibios_disable_device_hook(dev);
+
+ return;
+}
+
resource_size_t pcibios_io_space_offset(struct pci_controller *hose)
{
return (unsigned long) hose->io_base_virt - _IO_BASE;
--
1.7.9.5
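
For illustration, a platform would consume the new hook by assigning it in
its machdep_calls setup; the function names below are hypothetical, not from
this series:

/* Hypothetical platform code wiring up the new hook. */
static void example_pcibios_disable_device(struct pci_dev *pdev)
{
	/* release platform state tied to this device, e.g. VF PEs */
}

static void __init example_setup(void)
{
	ppc_md.pcibios_disable_device_hook = example_pcibios_disable_device;
}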
Wei Yang
2014-07-24 06:22:15 UTC
Permalink
If we're going to reassign resources with flag PCI_REASSIGN_ALL_RSRC, all
resources will be cleaned out during device header fixup time and then get
reassigned by the PCI core. However, the VF resources won't be reassigned,
and thus we shouldn't clean them out.

This patch adds a condition: if the pci_dev is a VF, skip the resource
unset process.

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/kernel/pci-common.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index b49c72f..ee70f57 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -794,6 +794,10 @@ static void pcibios_fixup_resources(struct pci_dev *dev)
pci_name(dev));
return;
}
+
+ if (dev->is_virtfn)
+ return;
+
for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
struct resource *res = dev->resource + i;
struct pci_bus_region reg;
--
1.7.9.5
Wei Yang
2014-07-24 06:22:24 UTC
On the powernv platform, a resource's position in M64 implies the PE# the
resource belongs to. In some particular cases, a resource must be adjusted to
locate it at the correct position in M64.

This patch introduces a function to shift the 'real' VF BAR address according
to an offset.

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/platforms/powernv/pci-ioda.c | 31 +++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3aeb87b..18e2917 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -14,6 +14,7 @@
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/crash_dump.h>
+#include <linux/pci_regs.h>
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/string.h>
@@ -730,6 +731,36 @@ static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
return 10;
}

+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+ struct pci_dn *pdn = pci_get_pdn(dev);
+ int i;
+ struct resource *res;
+ resource_size_t size;
+
+ if (!dev->is_physfn)
+ return;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = dev->resource + PCI_IOV_RESOURCES + i;
+ if (!res->flags || !res->parent)
+ continue;
+
+ if (!pnv_pci_is_mem_pref_64(res->flags))
+ continue;
+
+ dev_info(&dev->dev, "PowerNV: Shifting VF BAR %pR to\n", res);
+ size = pnv_pci_sriov_resource_size(dev, PCI_IOV_RESOURCES + i);
+ res->start += size*offset;
+
+ dev_info(&dev->dev, " %pR\n", res);
+ pci_update_resource(dev, PCI_IOV_RESOURCES + i);
+ }
+ pdn->vfs -= offset;
+}
+#endif /* CONFIG_PCI_IOV */
+
#if 0
static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
{
--
1.7.9.5
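
A worked example of the shift arithmetic with assumed numbers: if the per-VF
BAR size is 1MB and the offset is 4, the IOV BAR start moves up by 4MB, so
VF0's slice lands in the M64 segment belonging to PE# base+4:

/* Illustrative-only helper mirroring the patch's arithmetic. */
static resource_size_t example_shifted_start(resource_size_t start,
					     resource_size_t vf_size,
					     int offset)
{
	/* e.g. start + 0x100000 * 4 == start + 4MB */
	return start + vf_size * offset;
}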
Wei Yang
2014-07-24 06:22:25 UTC
VFs are created when the PCI device is enabled.

This patch tries its best to assign the maximum resources and PEs for VFs
when the PCI device is enabled: enough M64 windows are assigned to cover the
IOV BAR, and the IOV BAR is shifted to match the PE# indicated by M64. Each
VF's pdn->pdev and pdn->pe_number are fixed up.

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/pci-bridge.h | 4 +
arch/powerpc/platforms/powernv/pci-ioda.c | 382 ++++++++++++++++++++++++++++-
arch/powerpc/platforms/powernv/pci.c | 20 ++
arch/powerpc/platforms/powernv/pci.h | 6 +
4 files changed, 399 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 3cb95af..8cabe8b 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -175,6 +175,10 @@ struct pci_dn {
int pe_number;
#ifdef CONFIG_PCI_IOV
u16 vfs; /* number of VFs IOV BAR expanded */
+ u16 vf_pes;
+ int offset;
+#define IODA_INVALID_M64 (-1)
+ int m64_wins[PCI_SRIOV_NUM_BARS];
#endif /* CONFIG_PCI_IOV */
#endif
struct list_head child_list;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 18e2917..5bf67fe 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -42,6 +42,17 @@
#include "powernv.h"
#include "pci.h"

+#ifdef CONFIG_PCI_IOV
+#define VF_PE_LOG \
+ else if (pe->flags & PNV_IODA_PE_VF) \
sprintf(pfix, "%04x:%02x:%02x.%d", \
+ pci_domain_nr(pe->parent_dev->bus), \
+ (pe->rid & 0xff00) >> 8, \
+ PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
+#else /* CONFIG_PCI_IOV*/
+#define VF_PE_LOG
+#endif /* CONFIG_PCI_IOV*/
+
#define define_pe_printk_level(func, kern_level) \
static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...) \
{ \
@@ -55,13 +66,14 @@ static int func(const struct pnv_ioda_pe *pe, const char *fmt, ...) \
vaf.fmt = fmt; \
vaf.va = &args; \
\
- if (pe->pdev) \
+ if (pe->flags & PNV_IODA_PE_DEV) \
strlcpy(pfix, dev_name(&pe->pdev->dev), \
sizeof(pfix)); \
- else \
+ else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) \
sprintf(pfix, "%04x:%02x ", \
pci_domain_nr(pe->pbus), \
pe->pbus->number); \
+ VF_PE_LOG \
r = printk(kern_level "pci %s: [PE# %.3d] %pV", \
pfix, pe->pe_number, &vaf); \
\
@@ -542,7 +554,12 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
}
rid_end = pe->rid + (count << 8);
} else {
- parent = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+ if (pe->flags & PNV_IODA_PE_VF)
+ parent = pe->parent_dev;
+ else
+#endif /* CONFIG_PCI_IOV */
+ parent = pe->pdev->bus->self;
bcomp = OpalPciBusAll;
dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
@@ -589,6 +606,9 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)

pe->pbus = NULL;
pe->pdev = NULL;
+#ifdef CONFIG_PCI_IOV
+ pe->parent_dev = NULL;
+#endif /* CONFIG_PCI_IOV */

return 0;
}
@@ -628,7 +648,12 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
}
rid_end = pe->rid + (count << 8);
} else {
- parent = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+ if (pe->flags & PNV_IODA_PE_VF)
+ parent = pe->parent_dev;
+ else
+#endif /* CONFIG_PCI_IOV */
+ parent = pe->pdev->bus->self;
bcomp = OpalPciBusAll;
dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
@@ -965,6 +990,313 @@ static void pnv_pci_ioda_setup_PEs(void)
}
}

+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ int i;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ if (pdn->m64_wins[i] == IODA_INVALID_M64)
+ continue;
+ opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 0);
+ clear_bit(pdn->m64_wins[i], &phb->ioda.m64_bar_alloc);
+ pdn->m64_wins[i] = IODA_INVALID_M64;
+ }
+
+ return 0;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ unsigned int win;
+ struct resource *res;
+ int i;
+ int64_t rc;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ /* Initialize the m64_wins to IODA_INVALID_M64 */
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+ pdn->m64_wins[i] = IODA_INVALID_M64;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = pdev->resource + PCI_IOV_RESOURCES + i;
+ if (!res->flags || !res->parent)
+ continue;
+
+ if (!pnv_pci_is_mem_pref_64(res->flags))
+ continue;
+
+ do {
+ win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+ phb->ioda.m64_bar_idx + 1, 0);
+
+ if (win >= phb->ioda.m64_bar_idx + 1)
+ goto m64_failed;
+ } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+ pdn->m64_wins[i] = win;
+
+ /* Map the M64 here */
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ pdn->m64_wins[i],
+ res->start,
+ 0, /* unused */
+ resource_size(res));
+ if (rc != OPAL_SUCCESS) {
+ pr_err("Failed to map M64 BAR #%d: %lld\n", win, rc);
+ goto m64_failed;
+ }
+
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 1);
+ if (rc != OPAL_SUCCESS) {
+ pr_err("Failed to enable M64 BAR #%d: %llx\n", win, rc);
+ goto m64_failed;
+ }
+ }
+ return 0;
+
+m64_failed:
+ pnv_pci_vf_release_m64(pdev);
+ return -EBUSY;
+}
+
+/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
+#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
+static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct iommu_table *tbl;
+ unsigned long addr;
+
+ bus = dev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ tbl = pe->tce32_table;
+ addr = tbl->it_base;
+
+ opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+ pe->pe_number << 1, 1, __pa(addr),
+ 0, 0x1000);
+
+ opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+ pe->pe_number,
+ (pe->pe_number << 1) + 1,
+ pe->tce_bypass_base,
+ 0);
+
+ iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+ free_pages(addr, get_order(TCE32_TABLE_SIZE));
+ pe->tce32_table = NULL;
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe, *pe_n;
+ struct pci_dn *pdn;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+
+ if (!pdev->is_physfn)
+ return;
+
+ pdn = pci_get_pdn(pdev);
+ list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+ if (pe->parent_dev != pdev)
+ continue;
+
+ pnv_pci_ioda2_release_dma_pe(pdev, pe);
+
+ /* Remove from list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_del(&pe->list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+ pnv_ioda_deconfigure_pe(phb, pe);
+
+ pnv_ioda_free_pe(phb, pe->pe_number);
+ }
+}
+
+void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ struct pci_sriov *iov;
+ u16 vf_num;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+ iov = pdev->sriov;
+ vf_num = pdn->vf_pes;
+
+ /* Release VF PEs */
+ pnv_ioda_release_vf_PE(pdev);
+
+ if (phb->type == PNV_PHB_IODA2) {
+ pnv_pci_vf_resource_shift(pdev, -pdn->offset);
+
+ /* Release M64 BARs */
+ pnv_pci_vf_release_m64(pdev);
+
+ /* Release PE numbers */
+ bitmap_clear(phb->ioda.pe_alloc, pdn->offset, vf_num);
+ pdn->offset = 0;
+ }
+
+ return;
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe);
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 vf_num)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+ int pe_num;
+ u16 vf_index;
+ struct pci_dn *pdn;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ if (!pdev->is_physfn)
+ return;
+
+ /* Reserve PE for each VF */
+ for (vf_index = 0; vf_index < vf_num; vf_index++) {
+ pe_num = pdn->offset + vf_index;
+
+ pe = &phb->ioda.pe_array[pe_num];
+ pe->pe_number = pe_num;
+ pe->phb = phb;
+ pe->flags = PNV_IODA_PE_VF;
+ pe->pbus = NULL;
+ pe->parent_dev = pdev;
+ pe->tce32_seg = -1;
+ pe->mve_number = -1;
+ pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
+ pci_iov_virtfn_devfn(pdev, vf_index);
+
+ pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
+ hose->global_number, pdev->bus->number,
+ PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
+ PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ if (pe_num)
+ pnv_ioda_free_pe(phb, pe_num);
+ pe->pdev = NULL;
+ continue;
+ }
+
+ pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
+ GFP_KERNEL, hose->node);
+ pe->tce32_table->data = pe;
+
+ /* Put PE to the list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_add_tail(&pe->list, &phb->ioda.pe_list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
+
+ }
+}
+
+int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 vf_num)
+{
+ struct pci_bus *bus;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pci_dn *pdn;
+ int ret;
+
+ bus = pdev->bus;
+ hose = pci_bus_to_host(bus);
+ phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);
+
+ if (phb->type == PNV_PHB_IODA2) {
+ /* Calculate available PE for required VFs */
+ mutex_lock(&phb->ioda.pe_alloc_mutex);
+try_again:
+ pdn->offset = bitmap_find_next_zero_area(
+ phb->ioda.pe_alloc, phb->ioda.total_pe,
+ 0, vf_num, 0);
+ if (pdn->offset >= phb->ioda.total_pe) {
+ vf_num--;
+ if (vf_num)
+ goto try_again;
+
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
+ pr_info("Failed to enable VF\n");
+ pdn->offset = 0;
+ return -EBUSY;
+ }
+ bitmap_set(phb->ioda.pe_alloc, pdn->offset, vf_num);
+ pdn->vf_pes = vf_num;
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
+
+ /* Assign M64 BAR accordingly */
+ ret = pnv_pci_vf_assign_m64(pdev);
+ if (ret) {
+ pr_info("No enough M64 resource\n");
+ goto m64_failed;
+ }
+
+ /* Shift the VF BARs so that VF i lands in PE pdn->offset + i */
+ pnv_pci_vf_resource_shift(pdev, pdn->offset);
+ }
+
+ /* Setup VF PEs */
+ pnv_ioda_setup_vf_PE(pdev, vf_num);
+
+ return 0;
+
+m64_failed:
+ bitmap_clear(phb->ioda.pe_alloc, pdn->offset, vf_num);
+ pdn->offset = 0;
+
+ return ret;
+}
+#endif /* CONFIG_PCI_IOV */
+
static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
{
struct pci_dn *pdn = pci_get_pdn(pdev);
@@ -1132,9 +1464,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
int64_t rc;
void *addr;

- /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
-#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
-
/* XXX FIXME: Handle 64-bit only DMA devices */
/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
/* XXX FIXME: Allocate multi-level tables on PHB3 */
@@ -1197,12 +1526,19 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
TCE_PCI_SWINV_PAIR);
}
iommu_init_table(tbl, phb->hose->node);
- iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);

- if (pe->pdev)
+ if (pe->flags & PNV_IODA_PE_DEV) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
- else
+ } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+ } else if (pe->flags & PNV_IODA_PE_VF) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
+ }

return;
fail:
@@ -1328,12 +1664,19 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
}
iommu_init_table(tbl, phb->hose->node);
- iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);

- if (pe->pdev)
+ if (pe->flags & PNV_IODA_PE_DEV) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
- else
+ } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+ } else if (pe->flags & PNV_IODA_PE_VF) {
+ iommu_register_group(tbl, phb->hose->global_number,
+ pe->pe_number);
+ }

/* Also create a bypass window */
pnv_pci_ioda2_setup_bypass_pe(phb, pe);
@@ -1838,9 +2181,19 @@ static int pnv_pci_enable_device_hook(struct pci_dev *dev)
if (!pdn || pdn->pe_number == IODA_INVALID_PE)
return -EINVAL;

+ if (dev->is_physfn)
+ pnv_pci_sriov_enable(dev, pci_sriov_get_totalvfs(dev));
return 0;
}

+static void pnv_pci_disable_device_hook(struct pci_dev *dev)
+{
+ if (dev->is_physfn)
+ pnv_pci_sriov_disable(dev);
+
+ return;
+}
+
static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
u32 devfn)
{
@@ -1906,6 +2259,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->hub_id = hub_id;
phb->opal_id = phb_id;
phb->type = ioda_type;
+ mutex_init(&phb->ioda.pe_alloc_mutex);

/* Detect specific models for error handling */
if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
@@ -1966,6 +2320,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np,

INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
INIT_LIST_HEAD(&phb->ioda.pe_list);
+ mutex_init(&phb->ioda.pe_list_mutex);

/* Calculate how many 32-bit TCE segments we have */
phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
@@ -2020,6 +2375,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np,
*/
ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
+ ppc_md.pcibios_disable_device_hook = pnv_pci_disable_device_hook;
ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
#ifdef CONFIG_PCI_IOV
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index cc7c9a6..c5036d5 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -724,6 +724,26 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
{
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
struct pnv_phb *phb = hose->private_data;
+#ifdef CONFIG_PCI_IOV
+ struct pnv_ioda_pe *pe;
+ struct pci_dn *pdn;
+
+ /* Fix the VF pdn PE number */
+ if (pdev->is_virtfn) {
+ pdn = pci_get_pdn(pdev);
+ if (pdn->pcidev == NULL || pdn->pe_number == IODA_INVALID_PE) {
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ if (pe->rid ==
+ ((pdev->bus->number << 8) | (pdev->devfn & 0xff))) {
+ pdn->pcidev = pdev;
+ pdn->pe_number = pe->pe_number;
+ pe->pdev = pdev;
+ break;
+ }
+ }
+ }
+ }
+#endif /* CONFIG_PCI_IOV */

/* If we have no phb structure, try to setup a fallback based on
* the device-tree (RTAS PCI for example)
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index e55772f..5ea36ef 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -23,6 +23,7 @@ enum pnv_phb_model {
#define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */
#define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */
#define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
+#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */

/* Data associated with a PE, including IOMMU tracking etc.. */
struct pnv_phb;
@@ -34,6 +35,9 @@ struct pnv_ioda_pe {
* entire bus (& children). In the former case, pdev
* is populated, in the later case, pbus is.
*/
+#ifdef CONFIG_PCI_IOV
+ struct pci_dev *parent_dev;
+#endif
struct pci_dev *pdev;
struct pci_bus *pbus;

@@ -161,6 +165,7 @@ struct pnv_phb {

/* PE allocation bitmap */
unsigned long *pe_alloc;
+ struct mutex pe_alloc_mutex;

/* M32 & IO segment maps */
unsigned int *m32_segmap;
@@ -175,6 +180,7 @@ struct pnv_phb {
* on the sequence of creation
*/
struct list_head pe_list;
+ struct mutex pe_list_mutex;

/* Reverse map of PEs, will have to extend if
* we are to support more than 256 PEs, indexed
--
1.7.9.5
Wei Yang
2014-07-24 06:22:17 UTC
Permalink
From: Gavin Shan <***@linux.vnet.ibm.com>

pci_dn is the extension of a PCI device node and is created from the
device node. Unfortunately, VFs are enabled dynamically by the PF's
driver and don't have corresponding device nodes or pci_dn. This
patch refactors pci_dn to support VFs:

* pci_dn is organized as a hierarchical tree. A VF's pci_dn is put
on the child list of the pci_dn of the PF's bridge. The pci_dn of
every other device is put on the child list of the pci_dn of its
upstream bridge.

* A VF's pci_dn is created dynamically when the final fixup is
applied to the PF, and destroyed when the PF's pci_dev instance
is released. The pci_dn of every other device is still created
from the device node as before.

* For one particular PCI device (VF or not), its pci_dn can be
found from pdev->dev.archdata.firmware_data, PCI_DN(devnode), or
the parent's child list. The fast path (fetching pci_dn through
the PCI device instance) is populated at early fixup time.
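
A minimal sketch of how a VF's pci_dn gets linked under the PF bridge's
pci_dn (names match the patch; 'pf' and 'vf_index' are placeholders and
error handling is omitted):

    struct pci_dn *parent = pci_bus_to_pdn(pf->bus); /* PF bridge's pdn */
    struct pci_dn *vf_pdn = kzalloc(sizeof(*vf_pdn), GFP_KERNEL);

    vf_pdn->parent = parent;
    vf_pdn->busno = pci_iov_virtfn_bus(pf, vf_index);
    vf_pdn->devfn = pci_iov_virtfn_devfn(pf, vf_index);
    INIT_LIST_HEAD(&vf_pdn->child_list);
    INIT_LIST_HEAD(&vf_pdn->list);
    list_add_tail(&vf_pdn->list, &parent->child_list);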

Signed-off-by: Gavin Shan <***@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/device.h | 3 +
arch/powerpc/include/asm/pci-bridge.h | 14 +-
arch/powerpc/kernel/pci-hotplug.c | 3 +
arch/powerpc/kernel/pci_dn.c | 248 ++++++++++++++++++++++++++++++++-
4 files changed, 263 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 38faede..29992cd 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -34,6 +34,9 @@ struct dev_archdata {
#ifdef CONFIG_SWIOTLB
dma_addr_t max_direct_dma_addr;
#endif
+#ifdef CONFIG_PPC64
+ void *firmware_data;
+#endif
#ifdef CONFIG_EEH
struct eeh_dev *edev;
#endif
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 4ca90a3..757d7bb 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -89,6 +89,7 @@ struct pci_controller {

#ifdef CONFIG_PPC64
unsigned long buid;
+ void *firmware_data;
#endif /* CONFIG_PPC64 */

void *private_data;
@@ -150,9 +151,13 @@ static inline int isa_vaddr_is_ioport(void __iomem *address)
struct iommu_table;

struct pci_dn {
+ int flags;
+#define PCI_DN_FLAG_IOV_VF 0x01
+
int busno; /* pci bus number */
int devfn; /* pci device and function number */

+ struct pci_dn *parent;
struct pci_controller *phb; /* for pci devices */
struct iommu_table *iommu_table; /* for phb's or bridges */
struct device_node *node; /* back-pointer to the device_node */
@@ -169,14 +174,19 @@ struct pci_dn {
#ifdef CONFIG_PPC_POWERNV
int pe_number;
#endif
+ struct list_head child_list;
+ struct list_head list;
};

/* Get the pointer to a device_node's pci_dn */
#define PCI_DN(dn) ((struct pci_dn *) (dn)->data)

+extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
+ int devfn);
extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev);
-
-extern void * update_dn_pci_info(struct device_node *dn, void *data);
+extern struct pci_dn *add_dev_pci_info(struct pci_dev *pdev);
+extern void remove_dev_pci_info(struct pci_dev *pdev);
+extern void *update_dn_pci_info(struct device_node *dn, void *data);

static inline int pci_device_from_OF_node(struct device_node *np,
u8 *bus, u8 *devfn)
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 5b78917..af60efe 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -30,6 +30,9 @@
void pcibios_release_device(struct pci_dev *dev)
{
eeh_remove_device(dev);
+
+ /* Release firmware data */
+ remove_dev_pci_info(dev);
}

/**
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 1f61fab..9723108 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -32,12 +32,224 @@
#include <asm/ppc-pci.h>
#include <asm/firmware.h>

+/*
+ * This function finds the firmware data of one specific
+ * PCI device, which is attached to the indicated PCI bus.
+ * For VFs, the firmware data is linked to that of the PF's
+ * bridge. For other devices, the firmware data is linked
+ * to that of their bridge.
+ */
+static struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus)
+{
+ struct pci_bus *pbus;
+ struct device_node *dn;
+ struct pci_dn *pdn;
+
+ /*
+ * We probably have virtual bus which doesn't
+ * have associated bridge.
+ */
+ pbus = bus;
+ while (pbus) {
+ if (pci_is_root_bus(pbus) || pbus->self)
+ break;
+
+ pbus = pbus->parent;
+ }
+
+ /*
+ * Except virtual bus, all PCI buses should
+ * have device nodes.
+ */
+ dn = pci_bus_to_OF_node(pbus);
+ pdn = dn ? PCI_DN(dn) : NULL;
+
+ return pdn;
+}
+
+struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus,
+ int devfn)
+{
+ struct device_node *dn;
+ struct pci_dn *parent, *pdn;
+ struct pci_dev *pdev = NULL;
+
+ /* We can't call pci_get_slot() from interrupt context */
+ if (in_interrupt())
+ goto slow_path;
+
+ /* Fast path: fetch from PCI device */
+ pdev = pci_get_slot(bus, devfn);
+ pdn = pdev ? pdev->dev.archdata.firmware_data : NULL;
+ pci_dev_put(pdev);
+ if (pdn)
+ return pdn;
+
+ /* Fast path: fetch from device node */
+ dn = pdev ? pci_device_to_OF_node(pdev) : NULL;
+ pdn = dn ? PCI_DN(dn) : NULL;
+ if (pdn)
+ return pdn;
+
+ /* Slow path: fetch from firmware data hierarchy */
+slow_path:
+ parent = pci_bus_to_pdn(bus);
+ if (!parent)
+ return NULL;
+
+ list_for_each_entry(pdn, &parent->child_list, list) {
+ if (pdn->busno == bus->number &&
+ pdn->devfn == devfn)
+ return pdn;
+ }
+
+ return NULL;
+}
+
struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
{
- struct device_node *dn = pci_device_to_OF_node(pdev);
- if (!dn)
+ struct device_node *dn;
+ struct pci_dn *parent, *pdn;
+
+ /* Search device directly */
+ if (pdev->dev.archdata.firmware_data)
+ return pdev->dev.archdata.firmware_data;
+
+ /* Check device node */
+ dn = pci_device_to_OF_node(pdev);
+ pdn = dn ? PCI_DN(dn) : NULL;
+ if (pdn)
+ return pdn;
+
+ /*
+ * VFs don't have device nodes. We hook their
+ * firmware data to the PF's bridge.
+ */
+ parent = pci_bus_to_pdn(pdev->bus);
+ if (!parent)
return NULL;
- return PCI_DN(dn);
+
+ list_for_each_entry(pdn, &parent->child_list, list) {
+ if (pdn->busno == pdev->bus->number &&
+ pdn->devfn == pdev->devfn)
+ return pdn;
+ }
+
+ return NULL;
+}
+
+static struct pci_dn *add_one_dev_pci_info(struct pci_dn *parent,
+ struct pci_dev *pdev,
+ int busno, int devfn)
+{
+ struct pci_dn *pdn;
+
+ /* Except PHB, we always have parent firmware data */
+ if (!parent)
+ return NULL;
+
+ pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
+ if (!pdn) {
+ pr_warn("%s: Out of memory !\n", __func__);
+ return NULL;
+ }
+
+ pdn->phb = parent->phb;
+ pdn->parent = parent;
+ pdn->busno = busno;
+ pdn->devfn = devfn;
+#ifdef CONFIG_PPC_POWERNV
+ pdn->pe_number = IODA_INVALID_PE;
+#endif
+ INIT_LIST_HEAD(&pdn->child_list);
+ INIT_LIST_HEAD(&pdn->list);
+ list_add_tail(&pdn->list, &parent->child_list);
+
+ /*
+ * If we already have the PCI device instance, let's
+ * bind them.
+ */
+ if (pdev)
+ pdev->dev.archdata.firmware_data = pdn;
+
+ return pdn;
+}
+
+struct pci_dn *add_dev_pci_info(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+ struct pci_dn *parent, *pdn;
+ int i;
+
+ /* Only support IOV for now */
+ if (!pdev->is_physfn)
+ return pci_get_pdn(pdev);
+
+ /* Check if VFs have been populated */
+ pdn = pci_get_pdn(pdev);
+ if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF))
+ return NULL;
+
+ pdn->flags |= PCI_DN_FLAG_IOV_VF;
+ parent = pci_bus_to_pdn(pdev->bus);
+ if (!parent)
+ return NULL;
+
+ for (i = 0; i < pdev->sriov->total_VFs; i++) {
+ pdn = add_one_dev_pci_info(parent, NULL,
+ pci_iov_virtfn_bus(pdev, i),
+ pci_iov_virtfn_devfn(pdev, i));
+ if (!pdn) {
+ pr_warn("%s: Cannot create firmware data "
+ "for VF#%d of %s\n",
+ __func__, i, pci_name(pdev));
+ return NULL;
+ }
+ }
+#endif
+
+ return pci_get_pdn(pdev);
+}
+
+void remove_dev_pci_info(struct pci_dev *pdev)
+{
+#ifdef CONFIG_PCI_IOV
+ struct pci_dn *parent;
+ struct pci_dn *pdn, *tmp;
+ int i;
+
+ /* Only support IOV PF for now */
+ if (!pdev->is_physfn)
+ return;
+
+ /* Check if VFs have been populated */
+ pdn = pci_get_pdn(pdev);
+ if (!pdn || !(pdn->flags & PCI_DN_FLAG_IOV_VF))
+ return;
+
+ pdn->flags &= ~PCI_DN_FLAG_IOV_VF;
+ parent = pci_bus_to_pdn(pdev->bus);
+ if (!parent)
+ return;
+
+ /*
+ * We might introduce a flag to pci_dn in the future
+ * so that we can release VFs' firmware data in
+ * batch mode.
+ */
+ for (i = 0; i < pdev->sriov->total_VFs; i++) {
+ list_for_each_entry_safe(pdn, tmp,
+ &parent->child_list, list) {
+ if (pdn->busno != pci_iov_virtfn_bus(pdev, i) ||
+ pdn->devfn != pci_iov_virtfn_devfn(pdev, i))
+ continue;
+
+ if (!list_empty(&pdn->list))
+ list_del(&pdn->list);
+ kfree(pdn);
+ }
+ }
+#endif
}

/*
@@ -49,6 +261,7 @@ void *update_dn_pci_info(struct device_node *dn, void *data)
struct pci_controller *phb = data;
const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL);
const __be32 *regs;
+ struct device_node *parent;
struct pci_dn *pdn;

pdn = zalloc_maybe_bootmem(sizeof(*pdn), GFP_KERNEL);
@@ -70,6 +283,15 @@ void *update_dn_pci_info(struct device_node *dn, void *data)
}

pdn->pci_ext_config_space = (type && of_read_number(type, 1) == 1);
+
+ /* Attach to parent node */
+ INIT_LIST_HEAD(&pdn->child_list);
+ INIT_LIST_HEAD(&pdn->list);
+ parent = of_get_parent(dn);
+ pdn->parent = parent ? PCI_DN(parent) : NULL;
+ if (pdn->parent)
+ list_add_tail(&pdn->list, &pdn->parent->child_list);
+
return NULL;
}

@@ -150,6 +372,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb)
if (pdn) {
pdn->devfn = pdn->busno = -1;
pdn->phb = phb;
+ phb->firmware_data = pdn;
}

/* Update dn->phb ptrs for new phb and children devices */
@@ -173,3 +396,22 @@ void __init pci_devs_phb_init(void)
list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
pci_devs_phb_init_dynamic(phb);
}
+
+static void pci_dev_pdn_create(struct pci_dev *pdev)
+{
+ add_dev_pci_info(pdev);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_create);
+
+static void pci_dev_pdn_setup(struct pci_dev *pdev)
+{
+ struct pci_dn *pdn;
+
+ if (pdev->dev.archdata.firmware_data)
+ return;
+
+ /* Setup the fast path */
+ pdn = pci_get_pdn(pdev);
+ pdev->dev.archdata.firmware_data = pdn;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_setup);
--
1.7.9.5
Wei Yang
2014-07-24 06:22:20 UTC
Permalink
Currently, the iommu_table of a PE is a static field. This is a problem when
iommu_free_table() is called.

This patch allocates the iommu_table dynamically.
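
The pattern, in short (as used throughout the patch; the back pointer
replaces the old container_of() lookups):

    /* allocation, once per PE: */
    pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
                                   GFP_KERNEL, hose->node);
    pe->tce32_table->data = pe;
    /* lookup, in the TCE callbacks: */
    struct pnv_ioda_pe *pe = tbl->data; /* was container_of(tbl, ...) */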

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/iommu.h | 3 +++
arch/powerpc/platforms/powernv/pci-ioda.c | 26 ++++++++++++++------------
arch/powerpc/platforms/powernv/pci.h | 2 +-
3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 42632c7..0fedacb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -78,6 +78,9 @@ struct iommu_table {
struct iommu_group *it_group;
#endif
void (*set_bypass)(struct iommu_table *tbl, bool enable);
+#ifdef CONFIG_PPC_POWERNV
+ void *data;
+#endif
};

/* Pure 2^n version of get_order */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b5082a2..d66a76b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -784,6 +784,10 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
return;
}

+ pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
+ GFP_KERNEL, hose->node);
+ pe->tce32_table->data = pe;
+
/* Associate it with all child devices */
pnv_ioda_setup_same_PE(bus, pe);

@@ -857,7 +861,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev

pe = &phb->ioda.pe_array[pdn->pe_number];
WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
- set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+ set_iommu_table_base(&pdev->dev, pe->tce32_table);
}

static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
@@ -884,7 +888,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
} else {
dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
set_dma_ops(&pdev->dev, &dma_iommu_ops);
- set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+ set_iommu_table_base(&pdev->dev, pe->tce32_table);
}
return 0;
}
@@ -898,9 +902,9 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
list_for_each_entry(dev, &bus->devices, bus_list) {
if (add_to_iommu_group)
set_iommu_table_base_and_group(&dev->dev,
- &pe->tce32_table);
+ pe->tce32_table);
else
- set_iommu_table_base(&dev->dev, &pe->tce32_table);
+ set_iommu_table_base(&dev->dev, pe->tce32_table);

if (dev->subordinate)
pnv_ioda_setup_bus_dma(pe, dev->subordinate,
@@ -988,8 +992,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
__be64 *startp, __be64 *endp, bool rm)
{
- struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
- tce32_table);
+ struct pnv_ioda_pe *pe = tbl->data;
struct pnv_phb *phb = pe->phb;

if (phb->type == PNV_PHB_IODA1)
@@ -1055,7 +1058,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
}

/* Setup linux iommu table */
- tbl = &pe->tce32_table;
+ tbl = pe->tce32_table;
pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
base << 28);

@@ -1093,8 +1096,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,

static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
{
- struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
- tce32_table);
+ struct pnv_ioda_pe *pe = tbl->data;
uint16_t window_id = (pe->pe_number << 1 ) + 1;
int64_t rc;

@@ -1139,10 +1141,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
pe->tce_bypass_base = 1ull << 59;

/* Install set_bypass callback for VFIO */
- pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass;
+ pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;

/* Enable bypass by default */
- pnv_pci_ioda2_set_bypass(&pe->tce32_table, true);
+ pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
}

static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
@@ -1190,7 +1192,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
}

/* Setup linux iommu table */
- tbl = &pe->tce32_table;
+ tbl = pe->tce32_table;
pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0);

/* OPAL variant of PHB3 invalidated TCEs */
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index bff26d9..0d616f0 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -53,7 +53,7 @@ struct pnv_ioda_pe {
/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
int tce32_seg;
int tce32_segcount;
- struct iommu_table tce32_table;
+ struct iommu_table *tce32_table;
phys_addr_t tce_inval_reg_phys;

/* 64-bit TCE bypass region */
--
1.7.9.5
Wei Yang
2014-07-24 06:22:22 UTC
Permalink
On PHB3, VF resources are covered by M64 BARs for better PE isolation.
Usually the total_pe number is different from total_VFs, which leads to a
conflict between the MMIO space and the PE number.

This patch expands the VF resources to reserve room for total_pe copies of
the per-VF resource, which prevents the conflict.
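
A hypothetical worked example of the expansion (the sizes are invented for
illustration):

    /* per-VF BAR size 8MB, total_VFs = 64, total_pe = 256: the IOV BAR
     * grows from 8MB * 64 = 512MB to 8MB * 256 = 2GB, so each of the
     * 256 PEs owns exactly one 8MB segment. */
    size = pnv_pci_sriov_resource_size(pdev, i);
    res->end = res->start + size * phb->ioda.total_pe - 1;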

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/machdep.h | 4 ++
arch/powerpc/include/asm/pci-bridge.h | 3 +
arch/powerpc/kernel/pci-common.c | 5 ++
arch/powerpc/platforms/powernv/pci-ioda.c | 91 +++++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/pci.h | 3 +
5 files changed, 106 insertions(+)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 3909d1b..fabb8016 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -252,6 +252,10 @@ struct machdep_calls {
/* Reset the secondary bus of bridge */
void (*pcibios_reset_secondary_bus)(struct pci_dev *dev);

+#ifdef CONFIG_PCI_IOV
+ void (*pcibios_fixup_sriov)(struct pci_bus *bus);
+#endif /* CONFIG_PCI_IOV */
+
/* Called to shutdown machine specific hardware not already controlled
* by other drivers.
*/
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 757d7bb..3cb95af 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -173,6 +173,9 @@ struct pci_dn {
#define IODA_INVALID_PE (-1)
#ifdef CONFIG_PPC_POWERNV
int pe_number;
+#ifdef CONFIG_PCI_IOV
+ u16 vfs; /* number of VFs IOV BAR expanded */
+#endif /* CONFIG_PCI_IOV */
#endif
struct list_head child_list;
struct list_head list;
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index d38a330..c2b7930 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1651,6 +1651,11 @@ void pcibios_scan_phb(struct pci_controller *hose)
if (ppc_md.pcibios_fixup_phb)
ppc_md.pcibios_fixup_phb(hose);

+#ifdef CONFIG_PCI_IOV
+ if (ppc_md.pcibios_fixup_sriov)
+ ppc_md.pcibios_fixup_sriov(bus);
+#endif /* CONFIG_PCI_IOV */
+
/* Configure PCI Express settings */
if (bus && !pci_has_flag(PCI_PROBE_ONLY)) {
struct pci_bus *child;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8318b07..6fd2377 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1505,6 +1505,60 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
#endif /* CONFIG_PCI_MSI */

+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct resource *res;
+ int i;
+ resource_size_t size;
+ struct pci_dn *pdn;
+
+ if (!pdev->is_physfn || pdev->is_added)
+ return;
+
+ hose = pci_bus_to_host(pdev->bus);
+ phb = hose->private_data;
+
+ pdn = pci_get_pdn(pdev);
+ pdn->vfs = 0;
+
+ for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) {
+ res = &pdev->resource[i];
+ if (!res->flags || res->parent)
+ continue;
+ if (!pnv_pci_is_mem_pref_64(res->flags)) {
+ dev_warn(&pdev->dev, " non M64 IOV BAR %pR on %s\n",
+ res, pci_name(pdev));
+ continue;
+ }
+
+ dev_dbg(&pdev->dev, "PowerNV: Fixing VF BAR[%d] %pR to\n",
+ i, res);
+ size = pnv_pci_sriov_resource_size(pdev, i);
+ res->end = res->start + size * phb->ioda.total_pe - 1;
+ dev_dbg(&pdev->dev, " %pR\n", res);
+ }
+ pdn->vfs = phb->ioda.total_pe;
+}
+
+static void pnv_pci_ioda_fixup_sriov(struct pci_bus *bus)
+{
+ struct pci_dev *pdev;
+ struct pci_bus *b;
+
+ list_for_each_entry(pdev, &bus->devices, bus_list) {
+ b = pdev->subordinate;
+
+ if (b)
+ pnv_pci_ioda_fixup_sriov(b);
+
+ pnv_pci_ioda_fixup_iov_resources(pdev);
+ }
+}
+#endif /* CONFIG_PCI_IOV */
+
/*
* This function is supposed to be called on basis of PE from top
* to bottom style. So the I/O or MMIO segment assigned to
@@ -1681,6 +1735,40 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
return phb->ioda.io_segsize;
}

+/*
+ * Return the per-VF size of one IOV BAR, i.e. the expanded BAR
+ * size divided by the number of VFs it was expanded to cover.
+ */
+#ifdef CONFIG_PCI_IOV
+static resource_size_t pnv_pcibios_sriov_resource_size(struct pci_dev *pdev, int resno)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ resource_size_t size = 0;
+
+ if (!pdn->vfs)
+ return size;
+
+ size = resource_size(pdev->resource + resno);
+ do_div(size, pdn->vfs);
+
+ return size;
+}
+
+resource_size_t pnv_pci_sriov_resource_size(struct pci_dev *pdev, int resno)
+{
+ resource_size_t size;
+
+ size = pnv_pcibios_sriov_resource_size(pdev, resno);
+ if (size != 0)
+ return size;
+
+ size = resource_size(pdev->resource + resno);
+ do_div(size, pci_sriov_get_totalvfs(pdev));
+
+ return size;
+}
+#endif /* CONFIG_PCI_IOV */
+
/* Prevent enabling devices for which we couldn't properly
* assign a PE
*/
@@ -1886,6 +1974,9 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np,
ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook;
ppc_md.pcibios_window_alignment = pnv_pci_window_alignment;
ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
+#ifdef CONFIG_PCI_IOV
+ ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_sriov;
+#endif /* CONFIG_PCI_IOV */
pci_add_flags(PCI_REASSIGN_ALL_RSRC);

/* Reset IODA tables to a clean state */
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 0d616f0..e55772f 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -229,5 +229,8 @@ extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
__be64 *startp, __be64 *endp, bool rm);
extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
extern int ioda_eeh_phb_reset(struct pci_controller *hose, int option);
+#ifdef CONFIG_PCI_IOV
+resource_size_t pnv_pci_sriov_resource_size(struct pci_dev *pdev, int resno);
+#endif

#endif /* __POWERNV_PCI_H */
--
1.7.9.5
Wei Yang
2014-07-24 06:22:19 UTC
Permalink
On the powernv platform, the IOV BAR size is adjusted to meet the alignment
requirement of the hardware. This means the VF resource size needs to be
retrieved from hardware directly.

This patch sets the IORESOURCE_ARCH flag on the IOV BARs on the powernv
platform.
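
An illustrative check only (not part of this patch): with the flag set,
callers can tell that the BAR was resized by the platform and query the
real per-VF size instead of deriving it from the config-space value:

    if (pdev->resource[resno].flags & IORESOURCE_ARCH)
            size = pnv_pci_sriov_resource_size(pdev, resno);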

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/platforms/powernv/pci.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index b97aa79..cc7c9a6 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -21,6 +21,7 @@
#include <linux/io.h>
#include <linux/msi.h>
#include <linux/iommu.h>
+#include <uapi/linux/pci_regs.h>

#include <asm/sections.h>
#include <asm/io.h>
@@ -876,3 +877,20 @@ static int __init tce_iommu_bus_notifier_init(void)
}

subsys_initcall_sync(tce_iommu_bus_notifier_init);
+
+static void pnv_sriov_final_fixup(struct pci_dev *dev)
+{
+ struct resource *res;
+ int i;
+
+ if (!dev->is_physfn)
+ return;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = dev->resource + PCI_IOV_RESOURCES + i;
+ if (!res->flags)
+ continue;
+ res->flags |= IORESOURCE_ARCH;
+ }
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pnv_sriov_final_fixup);
--
1.7.9.5
Wei Yang
2014-07-24 06:22:26 UTC
Permalink
The M64 aperture size is limited on PHB3. When the IOV BAR is too big, it
exceeds the limit and fails to be assigned.

This patch introduces a different expansion based on the per-VF BAR size:

If the per-VF BAR size is 64M or smaller, expand to total_pe.
If the per-VF BAR size is bigger than 64M, round the VF count up to a power
of two.
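
The decision boils down to this (names as in the patch; 1 << 26 is 64M):

    size = pnv_pci_sriov_resource_size(pdev, i); /* per-VF BAR size */
    if (size > (1 << 26)) {
            mul = __roundup_pow_of_two(total_vfs);
            pdn->m64_per_iov = M64_PER_IOV; /* split across 4 M64 BARs */
    } else {
            mul = phb->ioda.total_pe;
    }
    res->end = res->start + size * mul - 1;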

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/pci-bridge.h | 2 ++
arch/powerpc/platforms/powernv/pci-ioda.c | 31 +++++++++++++++++++++++++++--
2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 8cabe8b..9c2c826 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -177,6 +177,8 @@ struct pci_dn {
u16 vfs; /* number of VFs IOV BAR expanded */
u16 vf_pes;
int offset;
+#define M64_PER_IOV 4
+ int m64_per_iov;
#define IODA_INVALID_M64 (-1)
int m64_wins[PCI_SRIOV_NUM_BARS];
#endif /* CONFIG_PCI_IOV */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5bf67fe..98fa01d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1888,6 +1888,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
int i;
resource_size_t size;
struct pci_dn *pdn;
+ int mul, total_vfs;

if (!pdev->is_physfn || pdev->is_added)
return;
@@ -1898,6 +1899,32 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
pdn = pci_get_pdn(pdev);
pdn->vfs = 0;

+ total_vfs = pci_sriov_get_totalvfs(pdev);
+ pdn->m64_per_iov = 1;
+ mul = phb->ioda.total_pe;
+
+ for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) {
+ res = &pdev->resource[i];
+ if (!res->flags || res->parent)
+ continue;
+ if (!pnv_pci_is_mem_pref_64(res->flags)) {
+ dev_warn(&pdev->dev, " non M64 IOV BAR %pR on %s\n",
+ res, pci_name(pdev));
+ continue;
+ }
+
+ size = pnv_pci_sriov_resource_size(pdev, i);
+
+ /* bigger than 64M */
+ if (size > (1 << 26)) {
+ dev_info(&pdev->dev, "PowerNV: VF BAR[%d] size "
+ "is bigger than 64M, roundup power2\n", i);
+ pdn->m64_per_iov = M64_PER_IOV;
+ mul = __roundup_pow_of_two(total_vfs);
+ break;
+ }
+ }
+
for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) {
res = &pdev->resource[i];
if (!res->flags || res->parent)
@@ -1911,10 +1938,10 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
dev_dbg(&pdev->dev, "PowerNV: Fixing VF BAR[%d] %pR to\n",
i, res);
size = pnv_pci_sriov_resource_size(pdev, i);
- res->end = res->start + size * phb->ioda.total_pe - 1;
+ res->end = res->start + size * mul - 1;
dev_dbg(&pdev->dev, " %pR\n", res);
}
- pdn->vfs = phb->ioda.total_pe;
+ pdn->vfs = mul;
}

static void pnv_pci_ioda_fixup_sriov(struct pci_bus *bus)
--
1.7.9.5
Wei Yang
2014-07-24 06:22:23 UTC
Permalink
This patch implements pcibios_sriov_resource_alignment() on the powernv
platform.
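
The rule it implements, in short (a sketch; 'align' is the per-VF alignment
passed in by the PCI core):

    iov_align = resource_size(&pdev->resource[resno]);
    if (iov_align)
            return iov_align; /* expanded IOV BAR: use its full size */
    if (pdn->vfs)
            return pdn->vfs * align; /* not expanded yet: reserve vfs slots */
    return align;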

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/machdep.h | 3 +++
arch/powerpc/kernel/pci-common.c | 14 ++++++++++++++
arch/powerpc/platforms/powernv/pci-ioda.c | 18 ++++++++++++++++++
3 files changed, 35 insertions(+)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index fabb8016..5c023be 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -254,6 +254,9 @@ struct machdep_calls {

#ifdef CONFIG_PCI_IOV
void (*pcibios_fixup_sriov)(struct pci_bus *bus);
+ resource_size_t (*pcibios_sriov_resource_alignment)(struct pci_dev *,
+ int resno,
+ resource_size_t align);
#endif /* CONFIG_PCI_IOV */

/* Called to shutdown machine specific hardware not already controlled
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index c2b7930..942a3e5 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -140,6 +140,20 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev)
ssleep(1);
}

+#ifdef CONFIG_PCI_IOV
+resource_size_t pcibios_sriov_resource_alignment(struct pci_dev *pdev,
+ int resno,
+ resource_size_t align)
+{
+ if (ppc_md.pcibios_sriov_resource_alignment)
+ return ppc_md.pcibios_sriov_resource_alignment(pdev,
+ resno,
+ align);
+
+ return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
static resource_size_t pcibios_io_size(const struct pci_controller *hose)
{
#ifdef CONFIG_PPC64
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 6fd2377..3aeb87b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1767,6 +1767,23 @@ resource_size_t pnv_pci_sriov_resource_size(struct pci_dev *pdev, int resno)

return size;
}
+
+static resource_size_t pnv_pcibios_sriov_resource_alignment(struct pci_dev *pdev,
+ int resno,
+ resource_size_t align)
+{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
+ resource_size_t iov_align;
+
+ iov_align = resource_size(&pdev->resource[resno]);
+ if (iov_align)
+ return iov_align;
+
+ if (pdn->vfs)
+ return pdn->vfs * align;
+
+ return align;
+}
#endif /* CONFIG_PCI_IOV */

/* Prevent enabling devices for which we couldn't properly
@@ -1976,6 +1993,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np,
ppc_md.pcibios_reset_secondary_bus = pnv_pci_reset_secondary_bus;
#ifdef CONFIG_PCI_IOV
ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_sriov;
+ ppc_md.pcibios_sriov_resource_alignment = pnv_pcibios_sriov_resource_alignment;
#endif /* CONFIG_PCI_IOV */
pci_add_flags(PCI_REASSIGN_ALL_RSRC);
--
1.7.9.5
Wei Yang
2014-07-24 06:22:18 UTC
Permalink
From: Gavin Shan <***@linux.vnet.ibm.com>

The PCI config accessors rely on the device node. Unfortunately, VFs
don't have corresponding device nodes, so we have to switch to
pci_dn for PCI config access.
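
After the switch, the bus accessors reduce to roughly this shape (a sketch
of the pattern in the patch, with the EEH handling omitted):

    static int pnv_pci_read_config(struct pci_bus *bus, unsigned int devfn,
                                   int where, int size, u32 *val)
    {
            struct pci_dn *pdn = pci_get_pdn_by_devfn(bus, devfn);

            if (!pdn || !pnv_pci_cfg_check(pdn))
                    return PCIBIOS_DEVICE_NOT_FOUND;
            return pnv_pci_cfg_read(pdn, where, size, val);
    }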

Signed-off-by: Gavin Shan <***@linux.vnet.ibm.com>
---
arch/powerpc/platforms/powernv/eeh-powernv.c | 24 ++++++++-
arch/powerpc/platforms/powernv/pci.c | 69 ++++++++++----------------
arch/powerpc/platforms/powernv/pci.h | 4 +-
3 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index f5bbc9f..357ec68 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -380,6 +380,26 @@ static int powernv_eeh_next_error(struct eeh_pe **pe)
return -EEXIST;
}

+static int powernv_eeh_cfg_read(struct device_node *dn,
+ int where, int size, u32 *val)
+{
+ struct pci_dn *pdn = PCI_DN(dn);
+
+ if (!pdn)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+ return pnv_pci_cfg_read(pdn, where, size, val);
+}
+
+static int powernv_eeh_cfg_write(struct device_node *dn,
+ int where, int size, u32 val)
+{
+ struct pci_dn *pdn = PCI_DN(dn);
+
+ if (!pdn)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+ return pnv_pci_cfg_write(pdn, where, size, val);
+}
+
static int powernv_eeh_restore_config(struct device_node *dn)
{
struct eeh_dev *edev = of_node_to_eeh_dev(dn);
@@ -414,8 +434,8 @@ static struct eeh_ops powernv_eeh_ops = {
.wait_state = powernv_eeh_wait_state,
.get_log = powernv_eeh_get_log,
.configure_bridge = powernv_eeh_configure_bridge,
- .read_config = pnv_pci_cfg_read,
- .write_config = pnv_pci_cfg_write,
+ .read_config = powernv_eeh_cfg_read,
+ .write_config = powernv_eeh_cfg_write,
.next_error = powernv_eeh_next_error,
.restore_config = powernv_eeh_restore_config
};
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index c823503..b97aa79 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -376,9 +376,9 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
spin_unlock_irqrestore(&phb->lock, flags);
}

-static void pnv_pci_config_check_eeh(struct pnv_phb *phb,
- struct device_node *dn)
+static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
{
+ struct pnv_phb *phb = pdn->phb->private_data;
u8 fstate;
__be16 pcierr;
int pe_no;
@@ -389,7 +389,7 @@ static void pnv_pci_config_check_eeh(struct pnv_phb *phb,
* setup that yet. So all ER errors should be mapped to
* reserved PE.
*/
- pe_no = PCI_DN(dn)->pe_number;
+ pe_no = pdn->pe_number;
if (pe_no == IODA_INVALID_PE) {
if (phb->type == PNV_PHB_P5IOC2)
pe_no = 0;
@@ -417,8 +417,7 @@ static void pnv_pci_config_check_eeh(struct pnv_phb *phb,
}

cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
- (PCI_DN(dn)->busno << 8) | (PCI_DN(dn)->devfn),
- pe_no, fstate);
+ (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);

/* Clear the frozen state if applicable */
if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE ||
@@ -435,10 +434,9 @@ static void pnv_pci_config_check_eeh(struct pnv_phb *phb,
}
}

-int pnv_pci_cfg_read(struct device_node *dn,
+int pnv_pci_cfg_read(struct pci_dn *pdn,
int where, int size, u32 *val)
{
- struct pci_dn *pdn = PCI_DN(dn);
struct pnv_phb *phb = pdn->phb->private_data;
u32 bdfn = (pdn->busno << 8) | pdn->devfn;
s64 rc;
@@ -472,10 +470,9 @@ int pnv_pci_cfg_read(struct device_node *dn,
return PCIBIOS_SUCCESSFUL;
}

-int pnv_pci_cfg_write(struct device_node *dn,
+int pnv_pci_cfg_write(struct pci_dn *pdn,
int where, int size, u32 val)
{
- struct pci_dn *pdn = PCI_DN(dn);
struct pnv_phb *phb = pdn->phb->private_data;
u32 bdfn = (pdn->busno << 8) | pdn->devfn;

@@ -499,18 +496,17 @@ int pnv_pci_cfg_write(struct device_node *dn,
}

#if CONFIG_EEH
-static bool pnv_pci_cfg_check(struct pci_controller *hose,
- struct device_node *dn)
+static bool pnv_pci_cfg_check(struct pci_dn *pdn)
{
struct eeh_dev *edev = NULL;
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pdn->phb->private_data;

/* EEH not enabled ? */
if (!(phb->flags & PNV_PHB_FLAG_EEH))
return true;

/* PE reset or device removed ? */
- edev = of_node_to_eeh_dev(dn);
+ edev = pdn->edev;
if (edev) {
if (edev->pe &&
(edev->pe->state & EEH_PE_RESET))
@@ -523,8 +519,7 @@ static bool pnv_pci_cfg_check(struct pci_controller *hose,
return true;
}
#else
-static inline pnv_pci_cfg_check(struct pci_controller *hose,
- struct device_node *dn)
+static inline bool pnv_pci_cfg_check(struct pci_dn *pdn)
{
return true;
}
@@ -534,32 +529,26 @@ static int pnv_pci_read_config(struct pci_bus *bus,
unsigned int devfn,
int where, int size, u32 *val)
{
- struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
struct pci_dn *pdn;
struct pnv_phb *phb;
- bool found = false;
int ret;

*val = 0xFFFFFFFF;
- for (dn = busdn->child; dn; dn = dn->sibling) {
- pdn = PCI_DN(dn);
- if (pdn && pdn->devfn == devfn) {
- phb = pdn->phb->private_data;
- found = true;
- break;
- }
- }
+ pdn = pci_get_pdn_by_devfn(bus, devfn);
+ if (!pdn)
+ return PCIBIOS_DEVICE_NOT_FOUND;

- if (!found || !pnv_pci_cfg_check(pdn->phb, dn))
+ if (!pnv_pci_cfg_check(pdn))
return PCIBIOS_DEVICE_NOT_FOUND;

- ret = pnv_pci_cfg_read(dn, where, size, val);
- if (phb->flags & PNV_PHB_FLAG_EEH) {
+ ret = pnv_pci_cfg_read(pdn, where, size, val);
+ phb = pdn->phb->private_data;
+ if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) {
if (*val == EEH_IO_ERROR_VALUE(size) &&
- eeh_dev_check_failure(of_node_to_eeh_dev(dn)))
+ eeh_dev_check_failure(pdn->edev))
return PCIBIOS_DEVICE_NOT_FOUND;
} else {
- pnv_pci_config_check_eeh(phb, dn);
+ pnv_pci_config_check_eeh(pdn);
}

return ret;
@@ -569,27 +558,21 @@ static int pnv_pci_write_config(struct pci_bus *bus,
unsigned int devfn,
int where, int size, u32 val)
{
- struct device_node *dn, *busdn = pci_bus_to_OF_node(bus);
struct pci_dn *pdn;
struct pnv_phb *phb;
- bool found = false;
int ret;

- for (dn = busdn->child; dn; dn = dn->sibling) {
- pdn = PCI_DN(dn);
- if (pdn && pdn->devfn == devfn) {
- phb = pdn->phb->private_data;
- found = true;
- break;
- }
- }
+ pdn = pci_get_pdn_by_devfn(bus, devfn);
+ if (!pdn)
+ return PCIBIOS_DEVICE_NOT_FOUND;

- if (!found || !pnv_pci_cfg_check(pdn->phb, dn))
+ if (!pnv_pci_cfg_check(pdn))
return PCIBIOS_DEVICE_NOT_FOUND;

- ret = pnv_pci_cfg_write(dn, where, size, val);
+ ret = pnv_pci_cfg_write(pdn, where, size, val);
+ phb = pdn->phb->private_data;
if (!(phb->flags & PNV_PHB_FLAG_EEH))
- pnv_pci_config_check_eeh(phb, dn);
+ pnv_pci_config_check_eeh(pdn);

return ret;
}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index b160e6b..bff26d9 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -215,9 +215,9 @@ extern struct pnv_eeh_ops ioda_eeh_ops;

void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
unsigned char *log_buff);
-int pnv_pci_cfg_read(struct device_node *dn,
+int pnv_pci_cfg_read(struct pci_dn *pdn,
int where, int size, u32 *val);
-int pnv_pci_cfg_write(struct device_node *dn,
+int pnv_pci_cfg_write(struct pci_dn *pdn,
int where, int size, u32 val);
extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
--
1.7.9.5
Wei Yang
2014-07-24 06:22:21 UTC
Permalink
The PowerNV platform will support dynamic PE allocation and deallocation.

This patch adds a function to release the resources related to a PE. It also
fixes a bug: when the PE is on the root bus, there is no associated bridge.
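
The core of the teardown is the OPAL unmap call; a minimal sketch for the
single-device case (the bcomp/dcomp/fcomp values match those used below):

    rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
                         OpalPciBusAll, OPAL_COMPARE_RID_DEVICE_NUMBER,
                         OPAL_COMPARE_RID_FUNCTION_NUMBER, OPAL_UNMAP_PE);
    if (rc)
            pe_err(pe, "OPAL error %ld trying to unmap PE\n", rc);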

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/platforms/powernv/pci-ioda.c | 90 ++++++++++++++++++++++++++++-
1 file changed, 89 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d66a76b..8318b07 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -505,6 +505,93 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
}
#endif /* CONFIG_PCI_MSI */

+static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+ struct pci_dev *parent;
+ uint8_t bcomp, dcomp, fcomp;
+ int64_t rc;
+ long rid_end, rid;
+
+ /* Currently, we just deconfigure VF PEs. Bus PEs are always there. */
+ if (pe->pbus) {
+ int count;
+
+ dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+ parent = pe->pbus->self;
+ if (pe->flags & PNV_IODA_PE_BUS_ALL)
+ count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+ else
+ count = 1;
+
+ switch(count) {
+ case 1: bcomp = OpalPciBusAll; break;
+ case 2: bcomp = OpalPciBus7Bits; break;
+ case 4: bcomp = OpalPciBus6Bits; break;
+ case 8: bcomp = OpalPciBus5Bits; break;
+ case 16: bcomp = OpalPciBus4Bits; break;
+ case 32: bcomp = OpalPciBus3Bits; break;
+ default:
+ pr_err("%s: Number of subordinate busses %d"
+ " unsupported\n",
+ pci_is_root_bus(pe->pbus)?"root bus":pci_name(pe->pbus->self),
+ count);
+ /* Do an exact match only */
+ bcomp = OpalPciBusAll;
+ }
+ rid_end = pe->rid + (count << 8);
+ } else {
+ parent = pe->pdev->bus->self;
+ bcomp = OpalPciBusAll;
+ dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+ fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+ rid_end = pe->rid + 1;
+ }
+
+ /* Disable MVT on IODA1 */
+ if (phb->type == PNV_PHB_IODA1) {
+ rc = opal_pci_set_mve_enable(phb->opal_id,
+ pe->mve_number, OPAL_DISABLE_MVE);
+ if (rc) {
+ pe_err(pe, "OPAL error %ld enabling MVE %d\n",
+ rc, pe->mve_number);
+ pe->mve_number = -1;
+ }
+ }
+ /* Clear the reverse map */
+ for (rid = pe->rid; rid < rid_end; rid++)
+ phb->ioda.pe_rmap[rid] = 0;
+
+ /* Release from all parents PELT-V */
+ while (parent) {
+ struct pci_dn *pdn = pci_get_pdn(parent);
+ if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+ rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+ pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+ /* XXX What to do in case of error ? */
+ }
+ parent = parent->bus->self;
+ }
+
+ opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+ /* Dissociate PE in PELT */
+ rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+ pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+ if (rc)
+ pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
+ rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+ bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
+ if (rc)
+ pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+
+ pe->pbus = NULL;
+ pe->pdev = NULL;
+
+ return 0;
+}
+
static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
struct pci_dev *parent;
@@ -533,7 +620,8 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
default:
pr_err("%s: Number of subordinate busses %d"
" unsupported\n",
- pci_name(pe->pbus->self), count);
+ pci_is_root_bus(pe->pbus)?"root bus":pci_name(pe->pbus->self),
+ count);
/* Do an exact match only */
bcomp = OpalPciBusAll;
}
--
1.7.9.5
Wei Yang
2014-07-24 06:22:27 UTC
Permalink
When the IOV BAR is big, each one is covered by 4 M64 windows. This leads
to several VF PEs sitting in one PE in terms of M64.

This patch groups VF PEs according to the M64 allocation.
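
A hypothetical example of the grouping arithmetic (vf_num is the number of
VFs being enabled):

    /* vf_num = 16, M64_PER_IOV = 4:
     * vf_groups    = 4 (one M64 window per group)
     * vf_per_group = roundup_pow_of_two(16) / 4 = 4 VFs per window */
    vf_groups = (vf_num <= M64_PER_IOV) ? vf_num : M64_PER_IOV;
    vf_per_group = (vf_num <= M64_PER_IOV) ? 1 :
                   __roundup_pow_of_two(vf_num) / M64_PER_IOV;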

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/pci-bridge.h | 2 +-
arch/powerpc/platforms/powernv/pci-ioda.c | 187 +++++++++++++++++++++++------
2 files changed, 149 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 9c2c826..41e52e3 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -180,7 +180,7 @@ struct pci_dn {
#define M64_PER_IOV 4
int m64_per_iov;
#define IODA_INVALID_M64 (-1)
- int m64_wins[PCI_SRIOV_NUM_BARS];
+ int m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
#endif /* CONFIG_PCI_IOV */
#endif
struct list_head child_list;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 98fa01d..88aa14f 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -997,26 +997,27 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
struct pci_controller *hose;
struct pnv_phb *phb;
struct pci_dn *pdn;
- int i;
+ int i, j;

bus = pdev->bus;
hose = pci_bus_to_host(bus);
phb = hose->private_data;
pdn = pci_get_pdn(pdev);

- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- if (pdn->m64_wins[i] == IODA_INVALID_M64)
- continue;
- opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 0);
- clear_bit(pdn->m64_wins[i], &phb->ioda.m64_bar_alloc);
- pdn->m64_wins[i] = IODA_INVALID_M64;
- }
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+ for (j = 0; j < M64_PER_IOV; j++) {
+ if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
+ continue;
+ opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
+ clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
+ pdn->m64_wins[i][j] = IODA_INVALID_M64;
+ }

return 0;
}

-static int pnv_pci_vf_assign_m64(struct pci_dev *pdev)
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 vf_num)
{
struct pci_bus *bus;
struct pci_controller *hose;
@@ -1024,17 +1025,33 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev)
struct pci_dn *pdn;
unsigned int win;
struct resource *res;
- int i;
+ int i, j;
int64_t rc;
+ int total_vfs;
+ resource_size_t size, start;
+ int pe_num;
+ int vf_groups;
+ int vf_per_group;

bus = pdev->bus;
hose = pci_bus_to_host(bus);
phb = hose->private_data;
pdn = pci_get_pdn(pdev);
+ total_vfs = pci_sriov_get_totalvfs(pdev);

/* Initialize the m64_wins to IODA_INVALID_M64 */
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
- pdn->m64_wins[i] = IODA_INVALID_M64;
+ for (j = 0; j < M64_PER_IOV; j++)
+ pdn->m64_wins[i][j] = IODA_INVALID_M64;
+
+ if (pdn->m64_per_iov == M64_PER_IOV) {
+ vf_groups = (vf_num <= M64_PER_IOV) ? vf_num : M64_PER_IOV;
+ vf_per_group = (vf_num <= M64_PER_IOV) ? 1 :
+ __roundup_pow_of_two(vf_num) / pdn->m64_per_iov;
+ } else {
+ vf_groups = 1;
+ vf_per_group = 1;
+ }

for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
res = pdev->resource + PCI_IOV_RESOURCES + i;
@@ -1044,33 +1061,61 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev)
if (!pnv_pci_is_mem_pref_64(res->flags))
continue;

- do {
- win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
- phb->ioda.m64_bar_idx + 1, 0);
-
- if (win >= phb->ioda.m64_bar_idx + 1)
- goto m64_failed;
- } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+ for (j = 0; j < vf_groups; j++) {
+ do {
+ win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+ phb->ioda.m64_bar_idx + 1, 0);
+
+ if (win >= phb->ioda.m64_bar_idx + 1)
+ goto m64_failed;
+ } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+ pdn->m64_wins[i][j] = win;
+
+ if (pdn->m64_per_iov == M64_PER_IOV) {
+ size = pnv_pci_sriov_resource_size(pdev,
+ PCI_IOV_RESOURCES + i);
+ size = size * vf_per_group;
+ start = res->start + size * j;
+ } else {
+ size = resource_size(res);
+ start = res->start;
+ }

- pdn->m64_wins[i] = win;
+ /* Map the M64 here */
+ if (pdn->m64_per_iov == M64_PER_IOV) {
+ pe_num = pdn->offset + j;
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe_num, OPAL_M64_WINDOW_TYPE,
+ pdn->m64_wins[i][j], 0);
+ }

- /* Map the M64 here */
- rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
OPAL_M64_WINDOW_TYPE,
- pdn->m64_wins[i],
- res->start,
+ pdn->m64_wins[i][j],
+ start,
0, /* unused */
- resource_size(res));
- if (rc != OPAL_SUCCESS) {
- pr_err("Failed to map M64 BAR #%d: %lld\n", win, rc);
- goto m64_failed;
- }
+ size);

- rc = opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i], 1);
- if (rc != OPAL_SUCCESS) {
- pr_err("Failed to enable M64 BAR #%d: %llx\n", win, rc);
- goto m64_failed;
+
+ if (rc != OPAL_SUCCESS) {
+ pr_err("Failed to set M64 BAR #%d: %lld\n",
+ win, rc);
+ goto m64_failed;
+ }
+
+ if (pdn->m64_per_iov == M64_PER_IOV)
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
+ else
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
+
+ if (rc != OPAL_SUCCESS) {
+ pr_err("Failed to enable M64 BAR #%d: %llx\n",
+ win, rc);
+ goto m64_failed;
+ }
}
}
return 0;
@@ -1111,21 +1156,53 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
pe->tce32_table = NULL;
}

-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 vf_num)
{
struct pci_bus *bus;
struct pci_controller *hose;
struct pnv_phb *phb;
struct pnv_ioda_pe *pe, *pe_n;
struct pci_dn *pdn;
+ u16 vf_index;
+ int64_t rc;

bus = pdev->bus;
hose = pci_bus_to_host(bus);
phb = hose->private_data;
+ pdn = pci_get_pdn(pdev);

if (!pdev->is_physfn)
return;

+ if (pdn->m64_per_iov == M64_PER_IOV && vf_num > M64_PER_IOV) {
+ int vf_group;
+ int vf_per_group;
+ int vf_index1;
+
+ vf_per_group = __roundup_pow_of_two(vf_num) / pdn->m64_per_iov;
+
+ for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
+ for (vf_index = vf_group * vf_per_group;
+ vf_index < (vf_group + 1) * vf_per_group &&
+ vf_index < vf_num;
+ vf_index++)
+ for (vf_index1 = vf_group * vf_per_group;
+ vf_index1 < (vf_group + 1) * vf_per_group &&
+ vf_index1 < vf_num;
+ vf_index1++) {
+
+ rc = opal_pci_set_peltv(phb->opal_id,
+ pdn->offset + vf_index,
+ pdn->offset + vf_index1,
+ OPAL_REMOVE_PE_FROM_DOMAIN);
+
+ if (rc)
+ pr_warn("%s: Failed to unlink same"
+ " group PE#%d(%lld)\n", __func__,
+ pdn->offset + vf_index1, rc);
+ }
+ }
+
pdn = pci_get_pdn(pdev);
list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
if (pe->parent_dev != pdev)
@@ -1161,10 +1238,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
vf_num = pdn->vf_pes;

/* Release VF PEs */
- pnv_ioda_release_vf_PE(pdev);
+ pnv_ioda_release_vf_PE(pdev, vf_num);

if (phb->type == PNV_PHB_IODA2) {
- pnv_pci_vf_resource_shift(pdev, -pdn->offset);
+ if (pdn->m64_per_iov == 1)
+ pnv_pci_vf_resource_shift(pdev, -pdn->offset);

/* Release M64 BARs */
pnv_pci_vf_release_m64(pdev);
@@ -1188,6 +1266,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 vf_num)
int pe_num;
u16 vf_index;
struct pci_dn *pdn;
+ int64_t rc;

bus = pdev->bus;
hose = pci_bus_to_host(bus);
@@ -1235,7 +1314,36 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 vf_num)
mutex_unlock(&phb->ioda.pe_list_mutex);

pnv_pci_ioda2_setup_dma_pe(phb, pe);
+ }

+ if (pdn->m64_per_iov == M64_PER_IOV && vf_num > M64_PER_IOV) {
+ int vf_group;
+ int vf_per_group;
+ int vf_index1;
+
+ vf_per_group = __roundup_pow_of_two(vf_num) / pdn->m64_per_iov;
+
+ for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
+ for (vf_index = vf_group * vf_per_group;
+ vf_index < (vf_group + 1) * vf_per_group &&
+ vf_index < vf_num;
+ vf_index++)
+ for (vf_index1 = vf_group * vf_per_group;
+ vf_index1 < (vf_group + 1) * vf_per_group &&
+ vf_index1 < vf_num;
+ vf_index1++) {
+
+ rc = opal_pci_set_peltv(phb->opal_id,
+ pdn->offset + vf_index,
+ pdn->offset + vf_index1,
+ OPAL_ADD_PE_TO_DOMAIN);
+
+ if (rc)
+ pr_warn("%s: Failed to link same "
+ "group PE#%d(%lld)\n",
+ __func__,
+ pdn->offset + vf_index1, rc);
+ }
}
}

@@ -1274,14 +1382,15 @@ try_again:
mutex_unlock(&phb->ioda.pe_alloc_mutex);

/* Assign M64 BAR accordingly */
- ret = pnv_pci_vf_assign_m64(pdev);
+ ret = pnv_pci_vf_assign_m64(pdev, vf_num);
if (ret) {
pr_info("No enough M64 resource\n");
goto m64_failed;
}

/* Do some magic shift */
- pnv_pci_vf_resource_shift(pdev, pdn->offset);
+ if (pdn->m64_per_iov == 1)
+ pnv_pci_vf_resource_shift(pdev, pdn->offset);
}

/* Setup VF PEs */
--
1.7.9.5
Wei Yang
2014-07-24 06:22:12 UTC
Permalink
The current implementation calculates the VF BAR size by dividing the total
size of the IOV BAR by the total VF number. It won't work on the PowerNV
platform because we're going to expand the IOV BAR size for finer alignment.

This patch enforces getting the IOV BAR size from hardware and then
calculates the VF BAR size based on that when the platform requests it.
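
For contrast, a stand-alone sketch of the default sizing this patch lets a
platform bypass; all numbers are made up:

#include <stdio.h>

int main(void)
{
	/* Default path in virtfn_add(): the PF's IOV BAR is an aggregate,
	 * so each VF gets an equal slice of it. */
	unsigned long long iov_bar = 16ULL << 20;	/* 16 MB IOV BAR */
	unsigned int total_vfs = 64;
	unsigned long long vf_size = iov_bar / total_vfs;
	int id = 3;					/* example VF index */

	printf("VF BAR size = %llu KB, VF%d offset = 0x%llx\n",
	       vf_size >> 10, id, vf_size * id);
	/* On PowerNV the IOV BAR may be expanded beyond total_VFs equal
	 * slices for alignment, so this division would over-size each VF
	 * BAR; hence reading the real size back from hardware. */
	return 0;
}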

Signed-off-by: Wei Yang <***@linux.vnet.ibm.com>
---
drivers/pci/iov.c | 28 ++++++++++++++++++++++++----
include/linux/ioport.h | 1 +
2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 7566238..ef1c546 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -55,6 +55,9 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
struct resource *res;
struct pci_sriov *iov = dev->sriov;
struct pci_bus *bus;
+ struct resource tmp;
+ enum pci_bar_type type;
+ int reg;

mutex_lock(&iov->dev->sriov->lock);
bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
@@ -80,12 +83,29 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
continue;
virtfn->resource[i].name = pci_name(virtfn);
virtfn->resource[i].flags = res->flags;
- size = resource_size(res);
- do_div(size, iov->total_VFs);
+ /* When res has IORESOURCE_ARCH, retrieve the IOV BAR size
+ * from hardware directly.
+ */
+ if (res->flags & IORESOURCE_ARCH) {
+ reg = pci_iov_resource_bar(dev, i + PCI_IOV_RESOURCES, &type);
+ __pci_read_base(dev, type, &tmp, reg);
+ size = resource_size(&tmp);
+ /* When __pci_read_base fails, flags is set to 0.
+ * In this case, reset size to 0, which means the VF
+ * will not be enabled.
+ */
+ if (!tmp.flags)
+ size = 0;
+ } else {
+ size = resource_size(res);
+ do_div(size, iov->total_VFs);
+ }
virtfn->resource[i].start = res->start + size * id;
virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
- rc = request_resource(res, &virtfn->resource[i]);
- BUG_ON(rc);
+ if (resource_size(&virtfn->resource[i])) {
+ rc = request_resource(res, &virtfn->resource[i]);
+ BUG_ON(rc);
+ }
}

if (reset)
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 5e3a906..de8b57c 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -48,6 +48,7 @@ struct resource {
#define IORESOURCE_MEM_64 0x00100000
#define IORESOURCE_WINDOW 0x00200000 /* forwarded by bridge */
#define IORESOURCE_MUXED 0x00400000 /* Resource is software muxed */
+#define IORESOURCE_ARCH 0x00800000 /* Resource arch tagged */

#define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */
#define IORESOURCE_DISABLED 0x10000000
--
1.7.9.5
Bjorn Helgaas
2014-08-19 21:44:59 UTC
Permalink
Post by Wei Yang
The current implementation calculates the VF BAR size by dividing the total
size of the IOV BAR by the total VF number. It won't work on the PowerNV
platform because we're going to expand the IOV BAR size for finer alignment.
This patch enforces getting the IOV BAR size from hardware and then
calculates the VF BAR size based on that when the platform requests it.
---
drivers/pci/iov.c | 28 ++++++++++++++++++++++++----
include/linux/ioport.h | 1 +
2 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 7566238..ef1c546 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -55,6 +55,9 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
struct resource *res;
struct pci_sriov *iov = dev->sriov;
struct pci_bus *bus;
+ struct resource tmp;
+ enum pci_bar_type type;
+ int reg;
mutex_lock(&iov->dev->sriov->lock);
bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
@@ -80,12 +83,29 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
continue;
virtfn->resource[i].name = pci_name(virtfn);
virtfn->resource[i].flags = res->flags;
- size = resource_size(res);
- do_div(size, iov->total_VFs);
+ /* When res has IORESOURCE_ARCH, retrieve the IOV BAR size
+ * from hardware directly.
+ */
+ if (res->flags & IORESOURCE_ARCH) {
+ reg = pci_iov_resource_bar(dev, i + PCI_IOV_RESOURCES, &type);
+ __pci_read_base(dev, type, &tmp, reg);
+ size = resource_size(&tmp);
+ /* When __pci_read_base fails, flags is set to 0.
+ * In this case, reset size to 0, which means the VF
+ * will not be enabled.
+ */
+ if (!tmp.flags)
+ size = 0;
I don't like the IORESOURCE_ARCH flag because it really doesn't have any
specific meaning. You're using it to enable some arch-specific code here
for this specific case. But there are any number of other places that
could do something similar, and there's no way to coordinate them all.

I'd rather have some sort of pcibios_*() hook here where powerpc could
override the default implementation.
Post by Wei Yang
+ } else {
+ size = resource_size(res);
+ do_div(size, iov->total_VFs);
+ }
virtfn->resource[i].start = res->start + size * id;
virtfn->resource[i].end = virtfn->resource[i].start + size - 1;
- rc = request_resource(res, &virtfn->resource[i]);
- BUG_ON(rc);
+ if (resource_size(&virtfn->resource[i])) {
+ rc = request_resource(res, &virtfn->resource[i]);
+ BUG_ON(rc);
+ }
}
if (reset)
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 5e3a906..de8b57c 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -48,6 +48,7 @@ struct resource {
#define IORESOURCE_MEM_64 0x00100000
#define IORESOURCE_WINDOW 0x00200000 /* forwarded by bridge */
#define IORESOURCE_MUXED 0x00400000 /* Resource is software muxed */
+#define IORESOURCE_ARCH 0x00800000 /* Resource arch tagged */
#define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */
#define IORESOURCE_DISABLED 0x10000000
--
1.7.9.5
Wei Yang
2014-08-20 02:31:18 UTC
Permalink
Post by Bjorn Helgaas
Post by Wei Yang
...
I don't like the IORESOURCE_ARCH flag because it really doesn't have any
specific meaning. You're using it to enable some arch-specific code here
for this specific case. But there are any number of other places that
could do something similar, and there's no way to coordinate them all.
I'd rather have some sort of pcibios_*() hook here where powerpc could
override the default implementation.
Yep, got it. I will write a pcibios_sriov_resource_size() and override it in
powerpc arch.
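
A minimal sketch of the weak-symbol pattern such a hook would use; it
compiles stand-alone with GCC/Clang, the name follows the discussion above,
and the exact signature is an assumption:

#include <stdio.h>

/* Weak generic default; an arch file providing a strong definition of
 * the same symbol wins at link time. */
__attribute__((weak)) long pcibios_sriov_resource_size(int resno)
{
	return -1;	/* no arch override: fall back to size / total_VFs */
}

int main(void)
{
	printf("resno 0 -> %ld\n", pcibios_sriov_resource_size(0));
	return 0;
}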
--
Richard Yang
Help you, Help me
Benjamin Herrenschmidt
2014-07-31 06:35:10 UTC
Permalink
This patch set enables the SRIOV on POWER8.
Hi Bjorn !

There are 4 patches in there to the generic code, but so far not much
review from your side of the fence :-)

How do you want to proceed ?

Cheers,
Ben.
The general idea is to put each VF into one individual PE and allocate
required resources like DMA/MSI.
One thing special for VF PEs is that we use the M64BT to cover the IOV BAR.
The M64BT is a piece of hardware on the POWER platform that maps MMIO
addresses to PEs. By using the M64BT, we can map an individual VF to a VF
PE, which introduces more flexibility for users.
To achieve this effect, we need to do some hacks on PCI devices' resources.
1. Expand the IOV BAR properly.
Done by pnv_pci_ioda_fixup_iov_resources().
2. Shift the IOV BAR properly.
Done by pnv_pci_vf_resource_shift().
3. IOV BAR alignment is the total size instead of an individual size on
the powernv platform (see the sketch after this list).
Done by pnv_pcibios_sriov_resource_alignment().
4. Take the IOV BAR alignment into consideration in the sizing and assigning.
This is achieved by commit: "PCI: Take additional IOV BAR alignment in
sizing and assigning"
The SRIOV device tested is Emulex Lancer and Mellanox ConnectX-3 on
POWER8.
1. install necessary modules
modprobe vfio
modprobe vfio-pci
2. retrieve the iommu_group the device belongs to
readlink /sys/bus/pci/devices/0000:06:0d.0/iommu_group
../../../../kernel/iommu_groups/26
This means it belongs to group 26
3. see how many devices under this iommu_group
ls /sys/kernel/iommu_groups/26/devices/
4. unbind the original driver and bind to vfio-pci driver
echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind
echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id
Note: this should be done for each device in the same iommu_group
5. Start qemu and pass device through vfio
/home/ywywyang/git/qemu-impreza/ppc64-softmmu/qemu-system-ppc64 \
-M pseries -m 2048 -enable-kvm -nographic \
-drive file=/home/ywywyang/kvm/fc19.img \
-monitor telnet:localhost:5435,server,nowait -boot cd \
-device "spapr-pci-vfio-host-bridge,id=CXGB3,iommu=26,index=6"
1. ping from a machine in the same subnet (the broadcast domain)
2. run arp -n on this machine
9.115.251.20 ether 00:00:c9:df:ed:bf C eth0
3. ifconfig in the guest
# ifconfig eth1
eth1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1500
inet 9.115.251.20 netmask 255.255.255.0 broadcast 9.115.251.255
inet6 fe80::200:c9ff:fedf:edbf prefixlen 64 scopeid 0x20<link>
ether 00:00:c9:df:ed:bf txqueuelen 1000 (Ethernet)
RX packets 175 bytes 13278 (12.9 KiB)
RX errors 0 dropped 0 overruns 0 frame 0
TX packets 58 bytes 9276 (9.0 KiB)
TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0
4. They have the same MAC address
Note: make sure you shutdown other network interfaces in guest.
---
1. add IORESOURCE_ARCH flag for IOV BAR on powernv platform.
2. when IOV BAR has IORESOURCE_ARCH flag, the size is retrieved from
hardware directly. If not, calculate as usual.
PCI, powerpc, powernv
4. rebase it on 3.16-rc6
1. remove pcibios_enable_sriov()/pcibios_disable_sriov() weak function
similar function is moved to
pnv_pci_enable_device_hook()/pnv_pci_disable_device_hook(). When the PF is
enabled, the platform will try its best to allocate resources for VFs.
2. remove pcibios_sriov_resource_size weak function
3. VF BAR size is retrieved from hardware directly in virtfn_add()
1. merge those SRIOV related platform functions in machdep_calls
wrap them in one CONFIG_PCI_IOV macro
2. define IODA_INVALID_M64 to replace (-1)
use this value to represent the m64_wins is not used
3. rename pnv_pci_release_dev_dma() to pnv_pci_ioda2_release_dma_pe()
this function is a counterpart to pnv_pci_ioda2_setup_dma_pe()
4. change dev_info() to dev_dgb() in pnv_pci_ioda_fixup_iov_resources()
reduce some log in kernel
5. release M64 window in pnv_pci_ioda2_release_dma_pe()
1. code format fix, eg. not exceed 80 chars
2. in commit "ppc/pnv: Add function to deconfig a PE"
check the bus has a bridge before printing the name
remove a PE from its own PELTV
3. change the function name for sriov resource size/alignment
4. rebase on 3.16-rc3
5. VFs will not rely on device node
As per Grant Likely's comments, the kernel should have the ability to handle
the lack of a device_node gracefully. Gavin restructured the pci_dn code,
which makes sure a VF will have a pci_dn even when the VF's device_node is
not provided by firmware.
6. clean all the patch title to make them comply with one style
7. fix return value for pci_iov_virtfn_bus/pci_iov_virtfn_devfn
1. change the return type of virtfn_bus/virtfn_devfn to int
change the name of these two functions to pci_iov_virtfn_bus/pci_iov_virtfn_devfn
2. reduce the second parameter or pcibios_sriov_disable()
3. use data instead of pe in "ppc/pnv: allocate pe->iommu_table dynamically"
4. rename __pci_sriov_resource_size to pcibios_sriov_resource_size
5. rename __pci_sriov_resource_alignment to pcibios_sriov_resource_alignment
1. change the return value of virtfn_bus/virtfn_devfn to 0
2. move some TCE related marco definition to
arch/powerpc/platforms/powernv/pci.h
3. fix the __pci_sriov_resource_alignment on powernv platform
During the sizing stage, the IOV BAR is truncated to 0, which will
affect the order of allocation. Fix this to make sure BARs will be
allocated ordered by their alignment.
1. improve the change log for
"PCI: Add weak __pci_sriov_resource_size() interface"
"PCI: Add weak __pci_sriov_resource_alignment() interface"
"PCI: take additional IOV BAR alignment in sizing and assigning"
2. wrap VF PE code in CONFIG_PCI_IOV
3. did regression test on P7.
powrepc/pci: Refactor pci_dn
powerpc/powernv: Use pci_dn in PCI config accessor
PCI/IOV: Export interface for retrieve VF's BDF
PCI/IOV: Get VF BAR size from hardware directly when platform needs
PCI: Add weak pcibios_sriov_resource_alignment() interface
PCI: Take additional IOV BAR alignment in sizing and assigning
powerpc/pci: Don't unset pci resources for VFs
powerpc/pci: Define pcibios_disable_device() on powerpc
powerpc/powernv: mark IOV BAR with IORESOURCE_ARCH
powerpc/powernv: Allocate pe->iommu_table dynamically
powerpc/powernv: Add function to deconfig a PE
powerpc/powernv: Expand VF resources according to the number of
total_pe
powerpc/powernv: Implement pcibios_sriov_resource_alignment on
powernv
powerpc/powernv: Shift VF resource with an offset
powerpc/powernv: Allocate VF PE
powerpc/powernv: Expanding IOV BAR, with m64_per_iov supported
powerpc/powernv: Group VF PE when IOV BAR is big on PHB3
arch/powerpc/include/asm/device.h | 3 +
arch/powerpc/include/asm/iommu.h | 3 +
arch/powerpc/include/asm/machdep.h | 12 +-
arch/powerpc/include/asm/pci-bridge.h | 23 +-
arch/powerpc/kernel/pci-common.c | 31 ++
arch/powerpc/kernel/pci-hotplug.c | 3 +
arch/powerpc/kernel/pci_dn.c | 248 ++++++++-
arch/powerpc/platforms/powernv/eeh-powernv.c | 24 +-
arch/powerpc/platforms/powernv/pci-ioda.c | 772 +++++++++++++++++++++++++-
arch/powerpc/platforms/powernv/pci.c | 107 ++--
arch/powerpc/platforms/powernv/pci.h | 15 +-
drivers/pci/iov.c | 65 ++-
drivers/pci/pci.h | 19 -
drivers/pci/setup-bus.c | 68 ++-
include/linux/ioport.h | 1 +
include/linux/pci.h | 47 ++
16 files changed, 1311 insertions(+), 130 deletions(-)
Bjorn Helgaas
2014-08-19 21:19:42 UTC
Permalink
...
4. rebase it on 3.16-rc6
This doesn't apply for me on v3.16-rc6:

02:48:57 ~/linux$ stg rebase v3.16-rc6
Checking for changes in the working directory ... done
Rebasing to "v3.16-rc6" ... done
No patches applied
02:49:14 ~/linux$ stg import -M --sign m/wy
Checking for changes in the working directory ... done
Importing patch "pci-iov-export-interface-for" ... done
Importing patch "pci-iov-get-vf-bar-size-from" ... done
Importing patch "pci-add-weak" ... done
Importing patch "pci-take-additional-iov-bar" ... done
Importing patch "powerpc-pci-don-t-unset-pci" ... done
Importing patch "powerpc-pci-define" ... done
Importing patch "powrepc-pci-refactor-pci_dn" ... done
Importing patch "powerpc-powernv-use-pci_dn-in" ... error: patch failed:
arch/powerpc/platforms/powernv/pci.c:376
error: arch/powerpc/platforms/powernv/pci.c: patch does not apply
stg import: Diff does not apply cleanly

What am I missing?

I assume you intend these all to go through my tree just to keep them all
together. The ideal rebase target for me would be v3.17-rc1.

Given the arch/powerpc parts, I'll want an ack from Ben. I just chatted
with him about these, so I assume that's not a problem, but we should make
it explicit.

Bjorn
Wei Yang
2014-08-20 02:34:04 UTC
Permalink
Post by Bjorn Helgaas
...
What am I missing?
I assume you intend these all to go through my tree just to keep them all
together. The ideal rebase target for me would be v3.17-rc1.
Ok, I will rebase it on v3.17-rc1 upstream. I guess the conflict is due
to some patches from Gavin, which were not merged at that moment. I will
make sure it applies to v3.17-rc1.
Post by Bjorn Helgaas
Given the arch/powerpc parts, I'll want an ack from Ben. I just chatted
with him about these, so I assume that's not a problem, but we should make
it explicit.
Bjorn
--
Richard Yang
Help you, Help me
Bjorn Helgaas
2014-08-20 03:12:27 UTC
Permalink
Post by Wei Yang
Post by Bjorn Helgaas
...
Ok, I will rebase it on v3.17-rc1 upstream. I guess the conflict is due
to some patches from Gavin, which were not merged at that moment. I will
make sure it applies to v3.17-rc1.
I tried applying them on v3.16-rc6 as well as on every change to
arch/powerpc/platforms/powernv/pci.c between v3.16-rc6 and v3.17-rc1,
and none applied cleanly. Patches you post should be based on some
upstream tag, not on something that includes unmerged patches.

Bjorn
Wei Yang
2014-08-20 03:35:46 UTC
Permalink
Post by Bjorn Helgaas
Post by Wei Yang
Post by Bjorn Helgaas
...
I tried applying them on v3.16-rc6 as well as on every change to
arch/powerpc/platforms/powernv/pci.c between v3.16-rc6 and v3.17-rc1,
and none applied cleanly. Patches you post should be based on some
upstream tag, not on something that includes unmerged patches.
Sorry about this, I will pay attention to this next time.
--
Richard Yang
Help you, Help me
Bjorn Helgaas
2014-10-02 15:59:43 UTC
Permalink
Post by Wei Yang
Post by Bjorn Helgaas
Post by Wei Yang
Post by Bjorn Helgaas
...
Sorry about this, I will pay attention to this next time.
I haven't seen any more on this series, and I'm assuming you'll post a
rebased series (maybe you're waiting for v3.18-rc1?). I'm just checking to
make sure you're not waiting for something from me...

Bjorn
Gavin Shan
2014-10-02 23:38:23 UTC
Permalink
Post by Bjorn Helgaas
Post by Wei Yang
Post by Bjorn Helgaas
Post by Wei Yang
Post by Bjorn Helgaas
...
I haven't seen any more on this series, and I'm assuming you'll post a
rebased series (maybe you're waiting for v3.18-rc1?). I'm just checking to
make sure you're not waiting for something from me...
Wei Yang is on vacation and he might not see your reply and respond in time.
As discussed with Wei Yang offline, he was waiting for v3.18-rc1 to rebase
and send a new version out for comments.

Thanks,
Gavin
Wei Yang
2014-10-15 09:00:14 UTC
Permalink
Post by Bjorn Helgaas
Post by Wei Yang
Post by Bjorn Helgaas
Post by Wei Yang
Post by Bjorn Helgaas
...
I haven't seen any more on this series, and I'm assuming you'll post a
rebased series (maybe you're waiting for v3.18-rc1?). I'm just checking to
make sure you're not waiting for something from me...
Hi, Bjorn

Haven't seen you for a long time :-) I am just back from vacation and my
mailbox didn't work well for the previous two days.

Yep, I am rebasing the code on top of v3.17, is this fine for you?
--
Richard Yang
Help you, Help me
Bjorn Helgaas
2014-10-15 13:52:16 UTC
Permalink
...
Post by Wei Yang
Post by Bjorn Helgaas
I haven't seen any more on this series, and I'm assuming you'll post a
rebased series (maybe you're waiting for v3.18-rc1?). I'm just checking to
make sure you're not waiting for something from me...
Hi, Bjorn
Haven't seen you for a long time :-) I am just back from vacation and my
mailbox didn't work well for the previous two days.
Yep, I am rebasing the code on top of v3.17, is this fine for you?
When I apply your patches, they will be on a branch based on
v3.18-rc1, so the easiest thing for me would be if you generate them
from that base. Here's why:

- Linus released v3.17 on Oct 5, 2014
- Merge window for v3.18 opened when v3.17 released
- All changes intended for v3.18 will be merged during window
(theoretically, at least)
- Merge window closes when Linus releases v3.18-rc1 (probably Oct 19 or 26)

Your changes will miss the v3.18 merge window, so the next chance to
merge them will be during the v3.19 merge window that opens when v3.18
releases. v3.18-rc1 is a close approximation of what v3.18 will
eventually be, so rebasing to v3.18-rc1 will minimize merge conflicts
when we eventually merge your changes on top of v3.18.

v3.18-rc1 isn't out yet, but if you rebase to the current head of
Linus' tree, that should be fairly close. The PCI changes are already
in (80213c03c415), and at least some of the powerpc changes are in
(fd9879b9bb32).

Bjorn
Wei Yang
2014-10-16 08:41:56 UTC
Permalink
Post by Bjorn Helgaas
...
Thanks for your explanation. I will do some rebasing and testing on v3.17,
then rebase it on v3.18-rc1 when it is released.

And finally send you the series based on v3.18-rc1.
--
Richard Yang
Help you, Help me
Bjorn Helgaas
2014-08-19 21:37:26 UTC
Permalink
Post by Wei Yang
When implementing SR-IOV on the PowerNV platform, some resource reservation
is needed for VFs which don't exist at the boot stage. To do the match
between resources and VFs, the code needs to get the VF's BDF in advance.
Ben started explaining this whole hardware PE/VF/etc stuff to me, but it
hasn't all sunk in yet. We need to describe it somewhere (it sounds pretty
involved, so maybe an extended description in Documentation/ would be
appropriate).

What I'm concerned about is that PCI resource assignment is a huge mess,
and this obviously complicates it even more. That's necessary and OK, but
I want to at least preserve the possibility that somebody could rework it
to make it manageable, and that means we need to know what the special
constraints of PowerNV are.

Code question below.
Post by Wei Yang
* Make virtfn_bus an interface
* Make virtfn_devfn an interface
* Rename them with more specific names
* Code cleanup in pci_sriov_resource_alignment()
---
drivers/pci/iov.c | 26 +++++++-------------------
drivers/pci/pci.h | 19 -------------------
include/linux/pci.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 51 insertions(+), 38 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index cb6f247..7566238 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -19,18 +19,6 @@
#define VIRTFN_ID_LEN 16
-static inline u8 virtfn_bus(struct pci_dev *dev, int id)
-{
- return dev->bus->number + ((dev->devfn + dev->sriov->offset +
- dev->sriov->stride * id) >> 8);
-}
-
-static inline u8 virtfn_devfn(struct pci_dev *dev, int id)
-{
- return (dev->devfn + dev->sriov->offset +
- dev->sriov->stride * id) & 0xff;
-}
-
static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
{
struct pci_bus *child;
@@ -69,7 +57,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
struct pci_bus *bus;
mutex_lock(&iov->dev->sriov->lock);
- bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id));
+ bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
if (!bus)
goto failed;
@@ -77,7 +65,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
if (!virtfn)
goto failed0;
- virtfn->devfn = virtfn_devfn(dev, id);
+ virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
virtfn->vendor = dev->vendor;
pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device);
pci_setup_device(virtfn);
@@ -140,8 +128,8 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
struct pci_sriov *iov = dev->sriov;
virtfn = pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus),
- virtfn_bus(dev, id),
- virtfn_devfn(dev, id));
+ pci_iov_virtfn_bus(dev, id),
+ pci_iov_virtfn_devfn(dev, id));
if (!virtfn)
return;
@@ -216,7 +204,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
iov->offset = offset;
iov->stride = stride;
- if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) {
+ if (pci_iov_virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) {
dev_err(&dev->dev, "SR-IOV: bus number out of range\n");
return -ENOMEM;
}
@@ -516,7 +504,7 @@ resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno)
if (!reg)
return 0;
- __pci_read_base(dev, type, &tmp, reg);
+ __pci_read_base(dev, type, &tmp, reg);
return resource_alignment(&tmp);
}
@@ -546,7 +534,7 @@ int pci_iov_bus_range(struct pci_bus *bus)
list_for_each_entry(dev, &bus->devices, bus_list) {
if (!dev->is_physfn)
continue;
- busnr = virtfn_bus(dev, dev->sriov->total_VFs - 1);
+ busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1);
if (busnr > max)
max = busnr;
}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 0601890..a3158b2 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -221,25 +221,6 @@ static inline int pci_ari_enabled(struct pci_bus *bus)
void pci_reassigndev_resource_alignment(struct pci_dev *dev);
void pci_disable_bridge_window(struct pci_dev *dev);
-/* Single Root I/O Virtualization */
-struct pci_sriov {
- int pos; /* capability position */
- int nres; /* number of resources */
- u32 cap; /* SR-IOV Capabilities */
- u16 ctrl; /* SR-IOV Control */
- u16 total_VFs; /* total VFs associated with the PF */
- u16 initial_VFs; /* initial VFs associated with the PF */
- u16 num_VFs; /* number of VFs available */
- u16 offset; /* first VF Routing ID offset */
- u16 stride; /* following VF stride */
- u32 pgsz; /* page size for BAR alignment */
- u8 link; /* Function Dependency Link */
- u16 driver_max_VFs; /* max num VFs driver supports */
- struct pci_dev *dev; /* lowest numbered PF */
- struct pci_dev *self; /* this PF */
- struct mutex lock; /* lock for VF bus */
-};
-
#ifdef CONFIG_PCI_ATS
void pci_restore_ats_state(struct pci_dev *dev);
#else
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 466bcd1..194db52 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -245,6 +245,27 @@ struct pci_vpd;
struct pci_sriov;
struct pci_ats;
+/* Single Root I/O Virtualization */
+struct pci_sriov {
+ int pos; /* capability position */
+ int nres; /* number of resources */
+ u32 cap; /* SR-IOV Capabilities */
+ u16 ctrl; /* SR-IOV Control */
+ u16 total_VFs; /* total VFs associated with the PF */
+ u16 initial_VFs; /* initial VFs associated with the PF */
+ u16 num_VFs; /* number of VFs available */
+ u16 offset; /* first VF Routing ID offset */
+ u16 stride; /* following VF stride */
+ u32 pgsz; /* page size for BAR alignment */
+ u8 link; /* Function Dependency Link */
+ u16 driver_max_VFs; /* max num VFs driver supports */
+ struct pci_dev *dev; /* lowest numbered PF */
+ struct pci_dev *self; /* this PF */
+ struct mutex lock; /* lock for VF bus */
+ struct work_struct mtask; /* VF Migration task */
+ u8 __iomem *mstate; /* VF Migration State Array */
+};
+
/*
* The pci_dev structure is used to describe PCI devices.
*/
@@ -1616,6 +1637,21 @@ int pci_ext_cfg_avail(void);
void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
#ifdef CONFIG_PCI_IOV
+static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
+{
+ if (!dev->is_physfn)
+ return -EINVAL;
+ return dev->bus->number + ((dev->devfn + dev->sriov->offset +
+ dev->sriov->stride * id) >> 8);
+}
+static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
+{
+ if (!dev->is_physfn)
+ return -EINVAL;
+ return (dev->devfn + dev->sriov->offset +
+ dev->sriov->stride * id) & 0xff;
+}
Do these really need to be inline? If they weren't inline, we wouldn't
have to move struct pci_sriov to the public header file.
Post by Wei Yang
+
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
void pci_disable_sriov(struct pci_dev *dev);
int pci_num_vf(struct pci_dev *dev);
@@ -1623,6 +1659,14 @@ int pci_vfs_assigned(struct pci_dev *dev);
int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
int pci_sriov_get_totalvfs(struct pci_dev *dev);
#else
+static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
+{
+ return -ENXIO;
+}
+static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
+{
+ return -ENXIO;
+}
static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{ return -ENODEV; }
static inline void pci_disable_sriov(struct pci_dev *dev) { }
--
1.7.9.5
Wei Yang
2014-08-20 02:25:20 UTC
Permalink
Post by Bjorn Helgaas
Post by Wei Yang
When implementing SR-IOV on the PowerNV platform, some resource reservation is
needed for VFs which don't exist at the bootup stage. To match resources
with VFs, the code needs to get the VF's BDF in advance.
Ben started explaining this whole hardware PE/VF/etc stuff to me, but it
hasn't all sunk in yet. We need to describe it somewhere (it sounds pretty
involved, so maybe an extended description in Documentation/ would be
appropriate).
Yes, the whole thing is not that easy to understand. I'd like to write a
file in Documentation/. Scanning the directory, I am not sure which location
would be proper; would Documentation/powerpc/ be fine?
Post by Bjorn Helgaas
What I'm concerned about is that PCI resource assignment is a huge mess,
and this obviously complicates it even more. That's necessary and OK, but
I want to at least preserve the possibility that somebody could rework it
to make it manageable, and that means we need to know what the special
constraints of PowerNV are.
Sure, let me try my best to explain it; my English is not that good, but I
hope it is understandable. :-)
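
To start with something concrete, here is the arithmetic this patch exposes
(the numbers below are made up for illustration, not taken from real
hardware): suppose a PF at 0000:04:00.0 whose SR-IOV capability reports
VF Offset = 1 and VF Stride = 1. The Routing ID of VF i is

	RID(VF i) = RID(PF) + offset + stride * i

so VF 0 is 04:00.1 (devfn 0x01), while VF 255 gets RID 0x0500, i.e. it
spills over onto bus 5. That spill is why pci_iov_virtfn_bus() adds
(devfn + offset + stride * id) >> 8 to the PF's bus number while
pci_iov_virtfn_devfn() keeps only the low 8 bits, and it is also why the
platform wants the VF BDFs in advance: it has to reserve resources for
buses that don't exist yet at boot.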
Post by Bjorn Helgaas
Code question below.
Post by Wei Yang
* Make virtfn_bus an interface
* Make virtfn_devfn an interface
* Rename them with more specific names
* Code cleanup in pci_sriov_resource_alignment()
---
drivers/pci/iov.c | 26 +++++++-------------------
drivers/pci/pci.h | 19 -------------------
include/linux/pci.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 51 insertions(+), 38 deletions(-)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index cb6f247..7566238 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -19,18 +19,6 @@
#define VIRTFN_ID_LEN 16
-static inline u8 virtfn_bus(struct pci_dev *dev, int id)
-{
- return dev->bus->number + ((dev->devfn + dev->sriov->offset +
- dev->sriov->stride * id) >> 8);
-}
-
-static inline u8 virtfn_devfn(struct pci_dev *dev, int id)
-{
- return (dev->devfn + dev->sriov->offset +
- dev->sriov->stride * id) & 0xff;
-}
-
static struct pci_bus *virtfn_add_bus(struct pci_bus *bus, int busnr)
{
struct pci_bus *child;
@@ -69,7 +57,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
struct pci_bus *bus;
mutex_lock(&iov->dev->sriov->lock);
- bus = virtfn_add_bus(dev->bus, virtfn_bus(dev, id));
+ bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
if (!bus)
goto failed;
@@ -77,7 +65,7 @@ static int virtfn_add(struct pci_dev *dev, int id, int reset)
if (!virtfn)
goto failed0;
- virtfn->devfn = virtfn_devfn(dev, id);
+ virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
virtfn->vendor = dev->vendor;
pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_DID, &virtfn->device);
pci_setup_device(virtfn);
@@ -140,8 +128,8 @@ static void virtfn_remove(struct pci_dev *dev, int id, int reset)
struct pci_sriov *iov = dev->sriov;
virtfn = pci_get_domain_bus_and_slot(pci_domain_nr(dev->bus),
- virtfn_bus(dev, id),
- virtfn_devfn(dev, id));
+ pci_iov_virtfn_bus(dev, id),
+ pci_iov_virtfn_devfn(dev, id));
if (!virtfn)
return;
@@ -216,7 +204,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
iov->offset = offset;
iov->stride = stride;
- if (virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) {
+ if (pci_iov_virtfn_bus(dev, nr_virtfn - 1) > dev->bus->busn_res.end) {
dev_err(&dev->dev, "SR-IOV: bus number out of range\n");
return -ENOMEM;
}
@@ -516,7 +504,7 @@ resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno)
if (!reg)
return 0;
- __pci_read_base(dev, type, &tmp, reg);
+ __pci_read_base(dev, type, &tmp, reg);
return resource_alignment(&tmp);
}
@@ -546,7 +534,7 @@ int pci_iov_bus_range(struct pci_bus *bus)
list_for_each_entry(dev, &bus->devices, bus_list) {
if (!dev->is_physfn)
continue;
- busnr = virtfn_bus(dev, dev->sriov->total_VFs - 1);
+ busnr = pci_iov_virtfn_bus(dev, dev->sriov->total_VFs - 1);
if (busnr > max)
max = busnr;
}
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 0601890..a3158b2 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -221,25 +221,6 @@ static inline int pci_ari_enabled(struct pci_bus *bus)
void pci_reassigndev_resource_alignment(struct pci_dev *dev);
void pci_disable_bridge_window(struct pci_dev *dev);
-/* Single Root I/O Virtualization */
-struct pci_sriov {
- int pos; /* capability position */
- int nres; /* number of resources */
- u32 cap; /* SR-IOV Capabilities */
- u16 ctrl; /* SR-IOV Control */
- u16 total_VFs; /* total VFs associated with the PF */
- u16 initial_VFs; /* initial VFs associated with the PF */
- u16 num_VFs; /* number of VFs available */
- u16 offset; /* first VF Routing ID offset */
- u16 stride; /* following VF stride */
- u32 pgsz; /* page size for BAR alignment */
- u8 link; /* Function Dependency Link */
- u16 driver_max_VFs; /* max num VFs driver supports */
- struct pci_dev *dev; /* lowest numbered PF */
- struct pci_dev *self; /* this PF */
- struct mutex lock; /* lock for VF bus */
-};
-
#ifdef CONFIG_PCI_ATS
void pci_restore_ats_state(struct pci_dev *dev);
#else
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 466bcd1..194db52 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -245,6 +245,27 @@ struct pci_vpd;
struct pci_sriov;
struct pci_ats;
+/* Single Root I/O Virtualization */
+struct pci_sriov {
+ int pos; /* capability position */
+ int nres; /* number of resources */
+ u32 cap; /* SR-IOV Capabilities */
+ u16 ctrl; /* SR-IOV Control */
+ u16 total_VFs; /* total VFs associated with the PF */
+ u16 initial_VFs; /* initial VFs associated with the PF */
+ u16 num_VFs; /* number of VFs available */
+ u16 offset; /* first VF Routing ID offset */
+ u16 stride; /* following VF stride */
+ u32 pgsz; /* page size for BAR alignment */
+ u8 link; /* Function Dependency Link */
+ u16 driver_max_VFs; /* max num VFs driver supports */
+ struct pci_dev *dev; /* lowest numbered PF */
+ struct pci_dev *self; /* this PF */
+ struct mutex lock; /* lock for VF bus */
+ struct work_struct mtask; /* VF Migration task */
+ u8 __iomem *mstate; /* VF Migration State Array */
+};
+
/*
* The pci_dev structure is used to describe PCI devices.
*/
@@ -1616,6 +1637,21 @@ int pci_ext_cfg_avail(void);
void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
#ifdef CONFIG_PCI_IOV
+static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
+{
+ if (!dev->is_physfn)
+ return -EINVAL;
+ return dev->bus->number + ((dev->devfn + dev->sriov->offset +
+ dev->sriov->stride * id) >> 8);
+}
+static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
+{
+ if (!dev->is_physfn)
+ return -EINVAL;
+ return (dev->devfn + dev->sriov->offset +
+ dev->sriov->stride * id) & 0xff;
+}
Do these really need to be inline? If they weren't inline, we wouldn't
have to move struct pci_sriov to the public header file.
They don't need to be inline, I think.

I will rework it and hide struct pci_sriov from the public header.
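
Roughly like this, I think -- just a sketch on top of this patch, not
tested yet. The declarations stay in include/linux/pci.h:

	#ifdef CONFIG_PCI_IOV
	int pci_iov_virtfn_bus(struct pci_dev *dev, int id);
	int pci_iov_virtfn_devfn(struct pci_dev *dev, int id);
	#else
	static inline int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
	{ return -ENXIO; }
	static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
	{ return -ENXIO; }
	#endif

and the bodies move into drivers/pci/iov.c, next to the other SR-IOV code
that can already see struct pci_sriov:

	/* drivers/pci/iov.c */
	int pci_iov_virtfn_bus(struct pci_dev *dev, int id)
	{
		if (!dev->is_physfn)
			return -EINVAL;
		return dev->bus->number + ((dev->devfn + dev->sriov->offset +
					    dev->sriov->stride * id) >> 8);
	}
	EXPORT_SYMBOL_GPL(pci_iov_virtfn_bus);

	int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
	{
		if (!dev->is_physfn)
			return -EINVAL;
		return (dev->devfn + dev->sriov->offset +
			dev->sriov->stride * id) & 0xff;
	}
	EXPORT_SYMBOL_GPL(pci_iov_virtfn_devfn);

The EXPORT_SYMBOL_GPL()s are only needed if modular code ends up calling
these; for a built-in user like PowerNV they could be dropped.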
--
Richard Yang
Help you, Help me