kernel-2.6.18-128.29.1.el5.src.rpm

From: Brad Peters <bpeters@redhat.com>
Date: Wed, 20 Aug 2008 13:06:03 -0400
Subject: [ppc64] cell: spufs update for RHEL-5.3
Message-id: 48AC4EFB.8060803@redhat.com
O-Subject: Re: [PATCH Set RHEL5.3] Cell spufs update to mainline
Bugzilla: 439483
RH-Acked-by: David Howells <dhowells@redhat.com>

RHBZ#:
======
https://bugzilla.redhat.com/show_bug.cgi?id=439483

Description:
===========
Bug Fix / Power arch only

The spufs code for Cell was updated upstream after the RHEL 5.2 cutoff. This patch brings
spufs in RHEL 5.3 up to the current upstream level to incorporate the bug fixes made there.

RHEL Version Found:
================
RHEL 5.2

kABI Status:
============
No symbols were harmed.
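
Note on the pattern in the spufs/file.c hunks below: much of the churn comes from
spu_acquire() and spu_acquire_saved() now returning an error when interrupted (for
example by a signal), so every caller has to check the return value before touching the
context instead of assuming the lock was taken. A minimal sketch of the new calling
convention, with a hypothetical helper name used purely for illustration:

    static ssize_t spufs_example_read(struct file *file, char __user *buf,
                                      size_t len, loff_t *pos)
    {
            struct spu_context *ctx = file->private_data;
            ssize_t ret;

            ret = spu_acquire(ctx);         /* may fail, e.g. on signal delivery */
            if (ret)
                    return ret;             /* context was not locked; bail out */

            ret = __spufs_example_read(ctx, buf, len, pos); /* hypothetical helper */
            spu_release(ctx);

            return ret;
    }
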

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 6d73ae3..03119f1 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -594,7 +594,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
 	mm->context.sllp = SLB_VSID_USER | mmu_psize_defs[MMU_PAGE_4K].sllp;
 	get_paca()->context = mm->context;
 	slb_flush_and_rebolt();
-#ifdef CONFIG_SPU_BASE
+#ifdef CONFIG_SPE_BASE
 	spu_flush_all_slbs(mm);
 #endif
 #endif
@@ -718,7 +718,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 				       "non-cacheable mapping\n");
 				psize = mmu_vmalloc_psize = MMU_PAGE_4K;
 			}
-#ifdef CONFIG_SPU_BASE
+#ifdef CONFIG_SPE_BASE
 			spu_flush_all_slbs(mm);
 #endif
 		}
@@ -726,7 +726,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 			if (psize != get_paca()->context.user_psize) {
 				get_paca()->context = mm->context;
 				slb_flush_and_rebolt();
-#ifdef CONFIG_SPU_BASE
+#ifdef CONFIG_SPE_BASE
 				spu_flush_all_slbs(mm);
 #endif
 			}
@@ -735,7 +735,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 			get_paca()->vmalloc_sllp =
 				mmu_psize_defs[mmu_vmalloc_psize].sllp;
 			slb_vmalloc_update();
-#ifdef CONFIG_SPU_BASE
+#ifdef CONFIG_SPE_BASE
 			spu_flush_all_slbs(mm);
 #endif
 		}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a515c7d..1b768c9 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -513,8 +513,8 @@ int prepare_hugepage_range(unsigned long addr, unsigned long len)
 	if ((addr + len) > 0x100000000UL)
 		err = open_high_hpage_areas(current->mm,
 					    HTLB_AREA_MASK(addr, len));
-#ifdef CONFIG_SPU_BASE
-	spu_flush_all_slbs(current->mm);
+#ifdef CONFIG_SPE_BASE
+	spu_flush_all_slbs(mm);
 #endif
 	if (err) {
 		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
diff --git a/arch/powerpc/oprofile/Kconfig b/arch/powerpc/oprofile/Kconfig
index e3eb68f..7089e79 100644
--- a/arch/powerpc/oprofile/Kconfig
+++ b/arch/powerpc/oprofile/Kconfig
@@ -17,7 +17,7 @@ config OPROFILE
 
 config OPROFILE_CELL
 	bool "OProfile for Cell Broadband Engine"
-	depends on (OPROFILE = m || OPROFILE = y)
+	depends on (SPU_FS = y && OPROFILE = m) || (SPU_FS = y && OPROFILE = y) || (SPU_FS = m && OPROFILE = m)
 	default y
 	help
 	  Profiling of Cell BE SPUs requires special support enabled
diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c
index 08ef78d..c93ae86 100644
--- a/arch/powerpc/oprofile/cell/spu_profiler.c
+++ b/arch/powerpc/oprofile/cell/spu_profiler.c
@@ -127,7 +127,7 @@ static int cell_spu_pc_collection(int cpu)
 }
 
 
-static enum hrtimer_restart profile_spus(struct hrtimer *timer)
+static int profile_spus(struct hrtimer *timer)
 {
 	ktime_t kt;
 	int cpu, node, k, num_samples, spu_num;
diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig
index 5e20939..4ff987d 100644
--- a/arch/powerpc/platforms/cell/Kconfig
+++ b/arch/powerpc/platforms/cell/Kconfig
@@ -12,6 +12,23 @@ config SPU_FS
 	  Units on machines implementing the Broadband Processor
 	  Architecture.
 
+config SPU_FS_64K_LS
+	bool "Use 64K pages to map SPE local  store"
+	# we depend on PPC_MM_SLICES for now rather than selecting
+	# it because we depend on hugetlbfs hooks being present. We
+	# will fix that when the generic code has been improved to
+	# not require hijacking hugetlbfs hooks.
+	# This is marked as broken as PPC_MM_SLICES is not available
+	depends on SPU_FS && PPC_MM_SLICES && !PPC_64K_PAGES && BROKEN
+	default y
+	# the select clause is disabled until PPC_HAS_HASH_64K gets available
+	# select PPC_HAS_HASH_64K
+	help
+	  This option causes SPE local stores to be mapped in process
+	  address spaces using 64K pages while the rest of the kernel
+	  uses 4K pages. This can improve performances of applications
+	  using multiple SPEs by lowering the TLB pressure on them.
+
 config SPU_BASE
 	bool
 	default n
diff --git a/arch/powerpc/platforms/cell/Makefile b/arch/powerpc/platforms/cell/Makefile
index eb0ea80..314cf9b 100644
--- a/arch/powerpc/platforms/cell/Makefile
+++ b/arch/powerpc/platforms/cell/Makefile
@@ -23,7 +23,7 @@ spu-manage-$(CONFIG_PPC_CELLEB)		+= spu_manage.o
 spu-manage-$(CONFIG_PPC_CELL_NATIVE)	+= spu_manage.o
 obj-$(CONFIG_SPU_BASE)			+= spu_callbacks.o spu_base.o \
 					   spu_notify.o \
-					   spu_syscalls.o \
+					   spu_syscalls.o spu_fault.o \
 					   $(spu-priv1-y) \
 					   $(spu-manage-y) \
 					   spufs/
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index c2c7bc4..bcbf9f8 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -36,6 +36,7 @@
 #include <linux/poll.h>
 #include <asm/spu.h>
 #include <asm/spu_priv1.h>
+#include <asm/spu_csa.h>
 #include <asm/xmon.h>
 #include <asm/prom.h>
 
@@ -49,6 +50,13 @@ struct cbe_spu_info cbe_spu_info[MAX_NUMNODES];
 EXPORT_SYMBOL_GPL(cbe_spu_info);
 
 /*
+ * The spufs fault-handling code needs to call force_sig_info to raise signals
+ * on DMA errors. Export it here to avoid general kernel-wide access to this
+ * function
+ */
+EXPORT_SYMBOL_GPL(force_sig_info);
+
+/*
  * Protects cbe_spu_info and spu->number.
  */
 static DEFINE_SPINLOCK(spu_lock);
@@ -68,6 +76,10 @@ static LIST_HEAD(spu_full_list);
 static DEFINE_SPINLOCK(spu_full_list_lock);
 static DEFINE_MUTEX(spu_full_list_mutex);
 
+struct spu_slb {
+	u64 esid, vsid;
+};
+
 void spu_invalidate_slbs(struct spu *spu)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
@@ -119,68 +131,68 @@ void spu_associate_mm(struct spu *spu, struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(spu_associate_mm);
 
-static int __spu_trap_invalid_dma(struct spu *spu)
+int spu_64k_pages_available(void)
 {
-	pr_debug("%s\n", __FUNCTION__);
-	spu->dma_callback(spu, SPE_EVENT_INVALID_DMA);
-	return 0;
+	return mmu_psize_defs[MMU_PAGE_64K].shift != 0;
 }
+EXPORT_SYMBOL_GPL(spu_64k_pages_available);
 
-static int __spu_trap_dma_align(struct spu *spu)
+static void spu_restart_dma(struct spu *spu)
 {
-	pr_debug("%s\n", __FUNCTION__);
-	spu->dma_callback(spu, SPE_EVENT_DMA_ALIGNMENT);
-	return 0;
-}
+	struct spu_priv2 __iomem *priv2 = spu->priv2;
 
-static int __spu_trap_error(struct spu *spu)
-{
-	pr_debug("%s\n", __FUNCTION__);
-	spu->dma_callback(spu, SPE_EVENT_SPE_ERROR);
-	return 0;
+	if (!test_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags))
+		out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND);
 }
 
-static void spu_restart_dma(struct spu *spu)
+static inline void spu_load_slb(struct spu *spu, int slbe, struct spu_slb *slb)
 {
 	struct spu_priv2 __iomem *priv2 = spu->priv2;
 
-	if (!test_bit(SPU_CONTEXT_SWITCH_PENDING, &spu->flags))
-		out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESTART_DMA_COMMAND);
+	pr_debug("%s: adding SLB[%d] 0x%016lx 0x%016lx\n",
+			__func__, slbe, slb->vsid, slb->esid);
+
+	out_be64(&priv2->slb_index_W, slbe);
+	/* set invalid before writing vsid */
+	out_be64(&priv2->slb_esid_RW, 0);
+	/* now it's safe to write the vsid */
+	out_be64(&priv2->slb_vsid_RW, slb->vsid);
+	/* setting the new esid makes the entry valid again */
+	out_be64(&priv2->slb_esid_RW, slb->esid);
 }
 
 static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
 {
-	struct spu_priv2 __iomem *priv2 = spu->priv2;
 	struct mm_struct *mm = spu->mm;
-	u64 esid, vsid, llp;
+	struct spu_slb slb;
 	int psize;
 
 	pr_debug("%s\n", __FUNCTION__);
 
-	esid = (ea & ESID_MASK) | SLB_ESID_V;
+	slb.esid = (ea & ESID_MASK) | SLB_ESID_V;
 
 	switch(REGION_ID(ea)) {
 	case USER_REGION_ID:
+#ifdef CONFIG_PPC_MM_SLICES
+		psize = get_slice_psize(mm, ea);
+#else
 		psize = mm->context.user_psize;
-#ifdef CONFIG_HUGETLB_PAGE
-		if (in_hugepage_area(mm->context, ea))
-			psize = mmu_huge_psize;
 #endif
-		vsid = (get_vsid(mm->context.id, ea) << SLB_VSID_SHIFT) |
-				SLB_VSID_USER;
+		slb.vsid = (get_vsid(mm->context.id, ea)
+				<< SLB_VSID_SHIFT) | SLB_VSID_USER;
 		break;
 	case VMALLOC_REGION_ID:
 		if (ea < VMALLOC_END)
 			psize = mmu_vmalloc_psize;
 		else
 			psize = mmu_io_psize;
-		vsid = (get_kernel_vsid(ea) << SLB_VSID_SHIFT) |
-			SLB_VSID_KERNEL;
+		slb.vsid = (get_kernel_vsid(ea)
+				<< SLB_VSID_SHIFT) | SLB_VSID_KERNEL;
 		break;
 	case KERNEL_REGION_ID:
 		psize = mmu_linear_psize;
-		vsid = (get_kernel_vsid(ea) << SLB_VSID_SHIFT) |
-			SLB_VSID_KERNEL;
+		slb.vsid = (get_kernel_vsid(ea)
+				<< SLB_VSID_SHIFT) | SLB_VSID_KERNEL;
 		break;
 	default:
 		/* Future: support kernel segments so that drivers
@@ -189,15 +201,9 @@ static int __spu_trap_data_seg(struct spu *spu, unsigned long ea)
 		pr_debug("invalid region access at %016lx\n", ea);
 		return 1;
 	}
-	llp = mmu_psize_defs[psize].sllp;
+	slb.vsid |= mmu_psize_defs[psize].sllp;
 
-	out_be64(&priv2->slb_index_W, spu->slb_replace);
-	/* set invalid before  writing vsid */
-	out_be64(&priv2->slb_esid_RW, 0);
-	/* now it's safe to write the vsid */
-	out_be64(&priv2->slb_vsid_RW, vsid | llp);
-	/* setting the new esid makes the entry valid again */
-	out_be64(&priv2->slb_esid_RW, esid);
+	spu_load_slb(spu, spu->slb_replace, &slb);
 
 	spu->slb_replace++;
 	if (spu->slb_replace >= 8)
@@ -222,13 +228,85 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr)
 		return 0;
 	}
 
+	spu->class_0_pending = 0;
 	spu->dar = ea;
 	spu->dsisr = dsisr;
-	mb();
+
 	spu->stop_callback(spu);
+
+	return 0;
+}
+
+static void __spu_kernel_slb(void *addr, struct spu_slb *slb)
+{
+	unsigned long ea = (unsigned long)addr;
+	u64 llp;
+
+	if (REGION_ID(ea) == KERNEL_REGION_ID)
+		llp = mmu_psize_defs[mmu_linear_psize].sllp;
+	else
+		llp = mmu_psize_defs[mmu_virtual_psize].sllp;
+
+	slb->vsid = (get_kernel_vsid(ea) << SLB_VSID_SHIFT) |
+		SLB_VSID_KERNEL | llp;
+	slb->esid = (ea & ESID_MASK) | SLB_ESID_V;
+}
+
+/**
+ * Given an array of @nr_slbs SLB entries, @slbs, return non-zero if the
+ * address @new_addr is present.
+ */
+static inline int __slb_present(struct spu_slb *slbs, int nr_slbs,
+		void *new_addr)
+{
+	unsigned long ea = (unsigned long)new_addr;
+	int i;
+
+	for (i = 0; i < nr_slbs; i++)
+		if (!((slbs[i].esid ^ ea) & ESID_MASK))
+			return 1;
+
 	return 0;
 }
 
+/**
+ * Setup the SPU kernel SLBs, in preparation for a context save/restore. We
+ * need to map both the context save area, and the save/restore code.
+ *
+ * Because the lscsa and code may cross segment boundaires, we check to see
+ * if mappings are required for the start and end of each range. We currently
+ * assume that the mappings are smaller that one segment - if not, something
+ * is seriously wrong.
+ */
+void spu_setup_kernel_slbs(struct spu *spu, struct spu_lscsa *lscsa,
+		void *code, int code_size)
+{
+	struct spu_slb slbs[4];
+	int i, nr_slbs = 0;
+	/* start and end addresses of both mappings */
+	void *addrs[] = {
+		lscsa, (void *)lscsa + sizeof(*lscsa) - 1,
+		code, code + code_size - 1
+	};
+
+	/* check the set of addresses, and create a new entry in the slbs array
+	 * if there isn't already a SLB for that address */
+	for (i = 0; i < ARRAY_SIZE(addrs); i++) {
+		if (__slb_present(slbs, nr_slbs, addrs[i]))
+			continue;
+
+		__spu_kernel_slb(addrs[i], &slbs[nr_slbs]);
+		nr_slbs++;
+	}
+
+	spin_lock_irq(&spu->register_lock);
+	/* Add the set of SLBs */
+	for (i = 0; i < nr_slbs; i++)
+		spu_load_slb(spu, i, &slbs[i]);
+	spin_unlock_irq(&spu->register_lock);
+}
+EXPORT_SYMBOL_GPL(spu_setup_kernel_slbs);
+
 static irqreturn_t
 spu_irq_class_0(int irq, void *data, struct pt_regs *regs)
 {
@@ -237,12 +315,13 @@ spu_irq_class_0(int irq, void *data, struct pt_regs *regs)
 
 	spu = data;
 
+	spin_lock(&spu->register_lock);
 	mask = spu_int_mask_get(spu, 0);
-	stat = spu_int_stat_get(spu, 0);
-	stat &= mask;
+	stat = spu_int_stat_get(spu, 0) & mask;
 
-	spin_lock(&spu->register_lock);
-	spu->class_0_pending_value |= stat;
+	spu->class_0_pending |= stat;
+	spu->dsisr = spu_mfc_dsisr_get(spu);
+	spu->dar = spu_mfc_dar_get(spu);
 	spin_unlock(&spu->register_lock);
 
 	spu->stop_callback(spu);
@@ -252,31 +331,6 @@ spu_irq_class_0(int irq, void *data, struct pt_regs *regs)
 	return IRQ_HANDLED;
 }
 
-int
-spu_irq_class_0_bottom(struct spu *spu)
-{
-	unsigned long flags;
-	unsigned long stat;
-
-	spin_lock_irqsave(&spu->register_lock, flags);
-	stat = spu->class_0_pending_value;
-	spu->class_0_pending_value = 0;
-
-	if (stat & 1) /* invalid DMA alignment */
-		__spu_trap_dma_align(spu);
-
-	if (stat & 2) /* invalid MFC DMA */
-		__spu_trap_invalid_dma(spu);
-
-	if (stat & 4) /* error on SPU */
-		__spu_trap_error(spu);
-
-	spin_unlock_irqrestore(&spu->register_lock, flags);
-
-	return (stat & 0x7) ? -EIO : 0;
-}
-EXPORT_SYMBOL_GPL(spu_irq_class_0_bottom);
-
 static irqreturn_t
 spu_irq_class_1(int irq, void *data, struct pt_regs *regs)
 {
@@ -291,25 +345,24 @@ spu_irq_class_1(int irq, void *data, struct pt_regs *regs)
 	stat  = spu_int_stat_get(spu, 1) & mask;
 	dar   = spu_mfc_dar_get(spu);
 	dsisr = spu_mfc_dsisr_get(spu);
-	if (stat & 2) /* mapping fault */
+	if (stat & CLASS1_STORAGE_FAULT_INTR)
 		spu_mfc_dsisr_set(spu, 0ul);
 	spu_int_stat_clear(spu, 1, stat);
-	if (stat & 1) /* segment fault */
+
+	if (stat & CLASS1_SEGMENT_FAULT_INTR)
 		__spu_trap_data_seg(spu, dar);
 
 	spin_unlock(&spu->register_lock);
-
 	pr_debug("%s: %lx %lx %lx %lx\n", __FUNCTION__, mask, stat,
 			dar, dsisr);
 
-	if (stat & 2) { /* mapping fault */
+	if (stat & CLASS1_STORAGE_FAULT_INTR)
 		__spu_trap_data_map(spu, dar, dsisr);
-	}
 
-	if (stat & 4) /* ls compare & suspend on get */
+	if (stat & CLASS1_LS_COMPARE_SUSPEND_ON_GET_INTR)
 		;
 
-	if (stat & 8) /* ls compare & suspend on put */
+	if (stat & CLASS1_LS_COMPARE_SUSPEND_ON_PUT_INTR)
 		;
 
 	return stat ? IRQ_HANDLED : IRQ_NONE;
@@ -321,6 +374,8 @@ spu_irq_class_2(int irq, void *data, struct pt_regs *regs)
 	struct spu *spu;
 	unsigned long stat;
 	unsigned long mask;
+	const int mailbox_intrs =
+		CLASS2_MAILBOX_THRESHOLD_INTR | CLASS2_MAILBOX_INTR;
 
 	spu = data;
 	spin_lock(&spu->register_lock);
@@ -328,31 +383,30 @@ spu_irq_class_2(int irq, void *data, struct pt_regs *regs)
 	mask = spu_int_mask_get(spu, 2);
 	/* ignore interrupts we're not waiting for */
 	stat &= mask;
-	/*
-	 * mailbox interrupts (0x1 and 0x10) are level triggered.
-	 * mask them now before acknowledging.
-	 */
-	if (stat & 0x11)
-		spu_int_mask_and(spu, 2, ~(stat & 0x11));
+
+	/* mailbox interrupts are level triggered. mask them now before
+	 * acknowledging */
+	if (stat & mailbox_intrs)
+		spu_int_mask_and(spu, 2, ~(stat & mailbox_intrs));
 	/* acknowledge all interrupts before the callbacks */
 	spu_int_stat_clear(spu, 2, stat);
 	spin_unlock(&spu->register_lock);
 
 	pr_debug("class 2 interrupt %d, %lx, %lx\n", irq, stat, mask);
 
-	if (stat & 1)  /* PPC core mailbox */
+	if (stat & CLASS2_MAILBOX_INTR)
 		spu->ibox_callback(spu);
 
-	if (stat & 2) /* SPU stop-and-signal */
+	if (stat & CLASS2_SPU_STOP_INTR)
 		spu->stop_callback(spu);
 
-	if (stat & 4) /* SPU halted */
+	if (stat & CLASS2_SPU_HALT_INTR)
 		spu->stop_callback(spu);
 
-	if (stat & 8) /* DMA tag group complete */
+	if (stat & CLASS2_SPU_DMA_TAG_GROUP_COMPLETE_INTR)
 		spu->mfc_callback(spu);
 
-	if (stat & 0x10) /* SPU mailbox threshold */
+	if (stat & CLASS2_MAILBOX_THRESHOLD_INTR)
 		spu->wbox_callback(spu);
 
 	spu->stats.class2_intr++;
@@ -477,13 +531,27 @@ EXPORT_SYMBOL_GPL(spu_add_sysdev_attr);
 int spu_add_sysdev_attr_group(struct attribute_group *attrs)
 {
 	struct spu *spu;
+	int rc = 0;
 
 	mutex_lock(&spu_full_list_mutex);
-	list_for_each_entry(spu, &spu_full_list, full_list)
-		sysfs_create_group(&spu->sysdev.kobj, attrs);
+	list_for_each_entry(spu, &spu_full_list, full_list) {
+		rc = sysfs_create_group(&spu->sysdev.kobj, attrs);
+
+		/* we're in trouble here, but try unwinding anyway */
+		if (rc) {
+			printk(KERN_ERR "%s: can't create sysfs group '%s'\n",
+					__func__, attrs->name);
+
+			list_for_each_entry_continue_reverse(spu,
+					&spu_full_list, full_list)
+				sysfs_remove_group(&spu->sysdev.kobj, attrs);
+			break;
+		}
+	}
+
 	mutex_unlock(&spu_full_list_mutex);
 
-	return 0;
+	return rc;
 }
 EXPORT_SYMBOL_GPL(spu_add_sysdev_attr_group);
 
diff --git a/arch/powerpc/platforms/cell/spu_fault.c b/arch/powerpc/platforms/cell/spu_fault.c
new file mode 100644
index 0000000..90a8a5f
--- /dev/null
+++ b/arch/powerpc/platforms/cell/spu_fault.c
@@ -0,0 +1,98 @@
+/*
+ * SPU mm fault handler
+ *
+ * (C) Copyright IBM Deutschland Entwicklung GmbH 2007
+ *
+ * Author: Arnd Bergmann <arndb@de.ibm.com>
+ * Author: Jeremy Kerr <jk@ozlabs.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <asm/spu.h>
+#include <asm/spu_csa.h>
+
+/*
+ * This ought to be kept in sync with the powerpc specific do_page_fault
+ * function. Currently, there are a few corner cases that we haven't had
+ * to handle fortunately.
+ */
+int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
+		unsigned long dsisr, unsigned *flt)
+{
+	struct vm_area_struct *vma;
+	unsigned long is_write;
+	int ret;
+
+#if 0
+	if (!IS_VALID_EA(ea)) {
+		return -EFAULT;
+	}
+#endif /* XXX */
+	if (mm == NULL) {
+		return -EFAULT;
+	}
+	if (mm->pgd == NULL) {
+		return -EFAULT;
+	}
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, ea);
+	if (!vma)
+		goto bad_area;
+	if (vma->vm_start <= ea)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (expand_stack(vma, ea))
+		goto bad_area;
+good_area:
+	is_write = dsisr & MFC_DSISR_ACCESS_PUT;
+	if (is_write) {
+		if (!(vma->vm_flags & VM_WRITE))
+			goto bad_area;
+	} else {
+		if (dsisr & MFC_DSISR_ACCESS_DENIED)
+			goto bad_area;
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+			goto bad_area;
+	}
+	ret = 0;
+	*flt = handle_mm_fault(mm, vma, ea, is_write);
+	if (unlikely(*flt & (VM_FAULT_OOM | VM_FAULT_SIGBUS))) {
+		if (*flt & VM_FAULT_OOM) {
+			ret = -ENOMEM;
+			goto bad_area;
+		} else if (*flt & VM_FAULT_SIGBUS) {
+			ret = -EFAULT;
+			goto bad_area;
+		}
+		BUG();
+	}
+	if (*flt & VM_FAULT_MAJOR)
+		current->maj_flt++;
+	else
+		current->min_flt++;
+	up_read(&mm->mmap_sem);
+	return ret;
+
+bad_area:
+	up_read(&mm->mmap_sem);
+	return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(spu_handle_mm_fault);
diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c
index 0ae89d9..489c70f 100644
--- a/arch/powerpc/platforms/cell/spu_manage.c
+++ b/arch/powerpc/platforms/cell/spu_manage.c
@@ -35,6 +35,7 @@
 #include <asm/firmware.h>
 #include <asm/prom.h>
 
+#include "spufs/spufs.h"
 #include "interrupt.h"
 
 struct device_node *spu_devnode(struct spu *spu)
@@ -345,7 +346,7 @@ static int __init of_create_spu(struct spu *spu, void *data)
 		}
 		ret = spu_map_interrupts_old(spu, spe);
 		if (ret) {
-			printk(KERN_ERR "%s: could not map interrupts",
+			printk(KERN_ERR "%s: could not map interrupts\n",
 				spu->name);
 			goto out_unmap;
 		}
@@ -369,6 +370,16 @@ static int of_destroy_spu(struct spu *spu)
 	return 0;
 }
 
+static void enable_spu_by_master_run(struct spu_context *ctx)
+{
+	ctx->ops->master_start(ctx);
+}
+
+static void disable_spu_by_master_run(struct spu_context *ctx)
+{
+	ctx->ops->master_stop(ctx);
+}
+
 /* Hardcoded affinity idxs for qs20 */
 #define QS20_SPES_PER_BE 8
 static int qs20_reg_idxs[QS20_SPES_PER_BE] =   { 0, 2, 4, 6, 7, 5, 3, 1 };
@@ -377,10 +388,10 @@ static int qs20_reg_memory[QS20_SPES_PER_BE] = { 1, 1, 0, 0, 0, 0, 0, 0 };
 static struct spu *spu_lookup_reg(int node, u32 reg)
 {
 	struct spu *spu;
-	u32 *spu_reg;
+	const u32 *spu_reg;
 
 	list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
-		spu_reg = (u32*)get_property(spu_devnode(spu), "reg", NULL);
+		spu_reg = get_property(spu_devnode(spu), "reg", NULL);
 		if (*spu_reg == reg)
 			return spu;
 	}
@@ -411,10 +422,15 @@ static void init_affinity_qs20_harcoded(void)
 
 static int of_has_vicinity(void)
 {
-	struct spu* spu;
+	struct device_node *dn;
 
-	spu = list_entry(cbe_spu_info[0].spus.next, struct spu, cbe_list);
-	return of_find_property(spu_devnode(spu), "vicinity", NULL) != NULL;
+	for_each_node_by_type(dn, "spe") {
+		if (of_find_property(dn, "vicinity", NULL))  {
+			of_node_put(dn);
+			return 1;
+		}
+	}
+	return 0;
 }
 
 static struct spu *devnode_spu(int cbe, struct device_node *dn)
@@ -525,7 +541,7 @@ static int __init init_affinity(void)
 		if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0"))
 			init_affinity_qs20_harcoded();
 		else
-			printk("No affinity configuration found");
+			printk("No affinity configuration found\n");
 	}
 
 	return 0;
@@ -535,5 +551,7 @@ const struct spu_management_ops spu_management_of_ops = {
 	.enumerate_spus = of_enumerate_spus,
 	.create_spu = of_create_spu,
 	.destroy_spu = of_destroy_spu,
+	.enable_spu = enable_spu_by_master_run,
+	.disable_spu = disable_spu_by_master_run,
 	.init_affinity = init_affinity,
 };
diff --git a/arch/powerpc/platforms/cell/spufs/Makefile b/arch/powerpc/platforms/cell/spufs/Makefile
index f56940d..d3a349f 100644
--- a/arch/powerpc/platforms/cell/spufs/Makefile
+++ b/arch/powerpc/platforms/cell/spufs/Makefile
@@ -1,8 +1,8 @@
-obj-y += switch.o lscsa_alloc.o fault.o
 
 obj-$(CONFIG_SPU_FS) += spufs.o
 spufs-y += inode.o file.o context.o syscalls.o coredump.o
 spufs-y += sched.o backing_ops.o hw_ops.o run.o gang.o
+spufs-y += switch.o fault.o lscsa_alloc.o
 
 # Rules to build switch.o with the help of SPU tool chain
 SPU_CROSS	:= spu-
diff --git a/arch/powerpc/platforms/cell/spufs/backing_ops.c b/arch/powerpc/platforms/cell/spufs/backing_ops.c
index ec01214..64eb15b 100644
--- a/arch/powerpc/platforms/cell/spufs/backing_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/backing_ops.c
@@ -106,16 +106,20 @@ static unsigned int spu_backing_mbox_stat_poll(struct spu_context *ctx,
 		if (stat & 0xff0000)
 			ret |= POLLIN | POLLRDNORM;
 		else {
-			ctx->csa.priv1.int_stat_class0_RW &= ~0x1;
-			ctx->csa.priv1.int_mask_class2_RW |= 0x1;
+			ctx->csa.priv1.int_stat_class2_RW &=
+				~CLASS2_MAILBOX_INTR;
+			ctx->csa.priv1.int_mask_class2_RW |=
+				CLASS2_ENABLE_MAILBOX_INTR;
 		}
 	}
 	if (events & (POLLOUT | POLLWRNORM)) {
 		if (stat & 0x00ff00)
 			ret = POLLOUT | POLLWRNORM;
 		else {
-			ctx->csa.priv1.int_stat_class0_RW &= ~0x10;
-			ctx->csa.priv1.int_mask_class2_RW |= 0x10;
+			ctx->csa.priv1.int_stat_class2_RW &=
+				~CLASS2_MAILBOX_THRESHOLD_INTR;
+			ctx->csa.priv1.int_mask_class2_RW |=
+				CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR;
 		}
 	}
 	spin_unlock_irq(&ctx->csa.register_lock);
@@ -139,7 +143,7 @@ static int spu_backing_ibox_read(struct spu_context *ctx, u32 * data)
 		ret = 4;
 	} else {
 		/* make sure we get woken up by the interrupt */
-		ctx->csa.priv1.int_mask_class2_RW |= 0x1UL;
+		ctx->csa.priv1.int_mask_class2_RW |= CLASS2_ENABLE_MAILBOX_INTR;
 		ret = 0;
 	}
 	spin_unlock(&ctx->csa.register_lock);
@@ -169,7 +173,8 @@ static int spu_backing_wbox_write(struct spu_context *ctx, u32 data)
 	} else {
 		/* make sure we get woken up by the interrupt when space
 		   becomes available */
-		ctx->csa.priv1.int_mask_class2_RW |= 0x10;
+		ctx->csa.priv1.int_mask_class2_RW |=
+			CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR;
 		ret = 0;
 	}
 	spin_unlock(&ctx->csa.register_lock);
@@ -268,6 +273,11 @@ static char *spu_backing_get_ls(struct spu_context *ctx)
 	return ctx->csa.lscsa->ls;
 }
 
+static void spu_backing_privcntl_write(struct spu_context *ctx, u64 val)
+{
+	ctx->csa.priv2.spu_privcntl_RW = val;
+}
+
 static u32 spu_backing_runcntl_read(struct spu_context *ctx)
 {
 	return ctx->csa.prob.spu_runcntl_RW;
@@ -278,6 +288,12 @@ static void spu_backing_runcntl_write(struct spu_context *ctx, u32 val)
 	spin_lock(&ctx->csa.register_lock);
 	ctx->csa.prob.spu_runcntl_RW = val;
 	if (val & SPU_RUNCNTL_RUNNABLE) {
+		ctx->csa.prob.spu_status_R &=
+			~SPU_STATUS_STOPPED_BY_STOP &
+			~SPU_STATUS_STOPPED_BY_HALT &
+			~SPU_STATUS_SINGLE_STEP &
+			~SPU_STATUS_INVALID_INSTR &
+			~SPU_STATUS_INVALID_CH;
 		ctx->csa.prob.spu_status_R |= SPU_STATUS_RUNNING;
 	} else {
 		ctx->csa.prob.spu_status_R &= ~SPU_STATUS_RUNNING;
@@ -285,6 +301,11 @@ static void spu_backing_runcntl_write(struct spu_context *ctx, u32 val)
 	spin_unlock(&ctx->csa.register_lock);
 }
 
+static void spu_backing_runcntl_stop(struct spu_context *ctx)
+{
+	spu_backing_runcntl_write(ctx, SPU_RUNCNTL_STOP);
+}
+
 static void spu_backing_master_start(struct spu_context *ctx)
 {
 	struct spu_state *csa = &ctx->csa;
@@ -358,7 +379,7 @@ static int spu_backing_send_mfc_command(struct spu_context *ctx,
 
 static void spu_backing_restart_dma(struct spu_context *ctx)
 {
-	/* nothing to do here */
+	ctx->csa.priv2.mfc_control_RW |= MFC_CNTL_RESTART_DMA_COMMAND;
 }
 
 struct spu_context_ops spu_backing_ops = {
@@ -379,8 +400,10 @@ struct spu_context_ops spu_backing_ops = {
 	.npc_write = spu_backing_npc_write,
 	.status_read = spu_backing_status_read,
 	.get_ls = spu_backing_get_ls,
+	.privcntl_write = spu_backing_privcntl_write,
 	.runcntl_read = spu_backing_runcntl_read,
 	.runcntl_write = spu_backing_runcntl_write,
+	.runcntl_stop = spu_backing_runcntl_stop,
 	.master_start = spu_backing_master_start,
 	.master_stop = spu_backing_master_stop,
 	.set_mfc_query = spu_backing_set_mfc_query,
diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c
index adf0a03..0ad83ae 100644
--- a/arch/powerpc/platforms/cell/spufs/context.c
+++ b/arch/powerpc/platforms/cell/spufs/context.c
@@ -52,6 +52,7 @@ struct spu_context *alloc_spu_context(struct spu_gang *gang)
 	init_waitqueue_head(&ctx->wbox_wq);
 	init_waitqueue_head(&ctx->stop_wq);
 	init_waitqueue_head(&ctx->mfc_wq);
+	init_waitqueue_head(&ctx->run_wq);
 	ctx->state = SPU_STATE_SAVED;
 	ctx->ops = &spu_backing_ops;
 	ctx->owner = get_task_mm(current);
@@ -105,7 +106,16 @@ int put_spu_context(struct spu_context *ctx)
 void spu_forget(struct spu_context *ctx)
 {
 	struct mm_struct *mm;
-	spu_acquire_saved(ctx);
+
+	/*
+	 * This is basically an open-coded spu_acquire_saved, except that
+	 * we don't acquire the state mutex interruptible, and we don't
+	 * want this context to be rescheduled on release.
+	 */
+	mutex_lock(&ctx->state_mutex);
+	if (ctx->state != SPU_STATE_SAVED)
+		spu_deactivate(ctx);
+
 	mm = ctx->owner;
 	ctx->owner = NULL;
 	mmput(mm);
@@ -133,47 +143,23 @@ void spu_unmap_mappings(struct spu_context *ctx)
 }
 
 /**
- * spu_acquire_runnable - lock spu contex and make sure it is in runnable state
+ * spu_acquire_saved - lock spu contex and make sure it is in saved state
  * @ctx:	spu contex to lock
- *
- * Note:
- *	Returns 0 and with the context locked on success
- *	Returns negative error and with the context _unlocked_ on failure.
  */
-int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags)
+int spu_acquire_saved(struct spu_context *ctx)
 {
-	int ret = -EINVAL;
-
-	spu_acquire(ctx);
-	if (ctx->state == SPU_STATE_SAVED) {
-		/*
-		 * Context is about to be freed, so we can't acquire it anymore.
-		 */
-		if (!ctx->owner)
-			goto out_unlock;
-		ret = spu_activate(ctx, flags);
-		if (ret)
-			goto out_unlock;
-	}
+	int ret;
 
-	return 0;
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 
- out_unlock:
-	spu_release(ctx);
-	return ret;
-}
-
-/**
- * spu_acquire_saved - lock spu contex and make sure it is in saved state
- * @ctx:	spu contex to lock
- */
-void spu_acquire_saved(struct spu_context *ctx)
-{
-	spu_acquire(ctx);
 	if (ctx->state != SPU_STATE_SAVED) {
 		set_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags);
 		spu_deactivate(ctx);
 	}
+
+	return 0;
 }
 
 /**
@@ -184,7 +170,8 @@ void spu_release_saved(struct spu_context *ctx)
 {
 	BUG_ON(ctx->state != SPU_STATE_SAVED);
 
-	if (test_and_clear_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags))
+	if (test_and_clear_bit(SPU_SCHED_WAS_ACTIVE, &ctx->sched_flags) &&
+			test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
 		spu_activate(ctx, 0);
 
 	spu_release(ctx);
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 80f6236..0c6a96b 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -148,7 +148,9 @@ int spufs_coredump_extra_notes_size(void)
 
 	fd = 0;
 	while ((ctx = coredump_next_context(&fd)) != NULL) {
-		spu_acquire_saved(ctx);
+		rc = spu_acquire_saved(ctx);
+		if (rc)
+			break;
 		rc = spufs_ctx_note_size(ctx, fd);
 		spu_release_saved(ctx);
 		if (rc < 0)
@@ -224,7 +226,9 @@ int spufs_coredump_extra_notes_write(struct file *file, loff_t *foffset)
 
 	fd = 0;
 	while ((ctx = coredump_next_context(&fd)) != NULL) {
-		spu_acquire_saved(ctx);
+		rc = spu_acquire_saved(ctx);
+		if (rc)
+			return rc;
 
 		for (j = 0; spufs_coredump_read[j].name != NULL; j++) {
 			rc = spufs_arch_write_note(ctx, j, file, fd, foffset);
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index 87648fe..e46d300 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -28,119 +28,71 @@
 
 #include "spufs.h"
 
-/*
- * This ought to be kept in sync with the powerpc specific do_page_fault
- * function. Currently, there are a few corner cases that we haven't had
- * to handle fortunately.
+/**
+ * Handle an SPE event, depending on context SPU_CREATE_EVENTS_ENABLED flag.
+ *
+ * If the context was created with events, we just set the return event.
+ * Otherwise, send an appropriate signal to the process.
  */
-static int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
-		unsigned long dsisr, unsigned *flt)
+static void spufs_handle_event(struct spu_context *ctx,
+				unsigned long ea, int type)
 {
-	struct vm_area_struct *vma;
-	unsigned long is_write;
-	int ret;
+	siginfo_t info;
 
-#if 0
-	if (!IS_VALID_EA(ea)) {
-		return -EFAULT;
-	}
-#endif /* XXX */
-	if (mm == NULL) {
-		return -EFAULT;
-	}
-	if (mm->pgd == NULL) {
-		return -EFAULT;
+	if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) {
+		ctx->event_return |= type;
+		wake_up_all(&ctx->stop_wq);
+		return;
 	}
 
-	down_read(&mm->mmap_sem);
-	vma = find_vma(mm, ea);
-	if (!vma)
-		goto bad_area;
-	if (vma->vm_start <= ea)
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-	if (expand_stack(vma, ea))
-		goto bad_area;
-good_area:
-	is_write = dsisr & MFC_DSISR_ACCESS_PUT;
-	if (is_write) {
-		if (!(vma->vm_flags & VM_WRITE))
-			goto bad_area;
-	} else {
-		if (dsisr & MFC_DSISR_ACCESS_DENIED)
-			goto bad_area;
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
-			goto bad_area;
-	}
-	ret = 0;
+	memset(&info, 0, sizeof(info));
 
-	switch (handle_mm_fault(mm, vma, ea, is_write)) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
+	switch (type) {
+	case SPE_EVENT_INVALID_DMA:
+		info.si_signo = SIGBUS;
+		info.si_code = BUS_OBJERR;
+		break;
+	case SPE_EVENT_SPE_DATA_STORAGE:
+		info.si_signo = SIGSEGV;
+		info.si_addr = (void __user *)ea;
+		info.si_code = SEGV_ACCERR;
+		ctx->ops->restart_dma(ctx);
 		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
+	case SPE_EVENT_DMA_ALIGNMENT:
+		info.si_signo = SIGBUS;
+		/* DAR isn't set for an alignment fault :( */
+		info.si_code = BUS_ADRALN;
+		break;
+	case SPE_EVENT_SPE_ERROR:
+		info.si_signo = SIGILL;
+		info.si_addr = (void __user *)(unsigned long)
+			ctx->ops->npc_read(ctx) - 4;
+		info.si_code = ILL_ILLOPC;
 		break;
-	case VM_FAULT_SIGBUS:
-		ret = -EFAULT;
-		goto bad_area;
-	case VM_FAULT_OOM:
-		ret = -ENOMEM;
-		goto bad_area;
-	default:
-		BUG();
 	}
-	up_read(&mm->mmap_sem);
-	return ret;
 
-bad_area:
-	up_read(&mm->mmap_sem);
-	return -EFAULT;
+	if (info.si_signo)
+		force_sig_info(info.si_signo, &info, current);
 }
 
-static void spufs_handle_dma_error(struct spu_context *ctx,
-				unsigned long ea, int type)
+int spufs_handle_class0(struct spu_context *ctx)
 {
-	if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) {
-		ctx->event_return |= type;
-		wake_up_all(&ctx->stop_wq);
-	} else {
-		siginfo_t info;
-		memset(&info, 0, sizeof(info));
-
-		switch (type) {
-		case SPE_EVENT_INVALID_DMA:
-			info.si_signo = SIGBUS;
-			info.si_code = BUS_OBJERR;
-			break;
-		case SPE_EVENT_SPE_DATA_STORAGE:
-			info.si_signo = SIGBUS;
-			info.si_addr = (void __user *)ea;
-			info.si_code = BUS_ADRERR;
-			break;
-		case SPE_EVENT_DMA_ALIGNMENT:
-			info.si_signo = SIGBUS;
-			/* DAR isn't set for an alignment fault :( */
-			info.si_code = BUS_ADRALN;
-			break;
-		case SPE_EVENT_SPE_ERROR:
-			info.si_signo = SIGILL;
-			info.si_addr = (void __user *)(unsigned long)
-				ctx->ops->npc_read(ctx) - 4;
-			info.si_code = ILL_ILLOPC;
-			break;
-		}
-		if (info.si_signo)
-			force_sig_info(info.si_signo, &info, current);
-	}
-}
+	unsigned long stat = ctx->csa.class_0_pending & CLASS0_INTR_MASK;
 
-void spufs_dma_callback(struct spu *spu, int type)
-{
-	spufs_handle_dma_error(spu->ctx, spu->dar, type);
+	if (likely(!stat))
+		return 0;
+
+	if (stat & CLASS0_DMA_ALIGNMENT_INTR)
+		spufs_handle_event(ctx, ctx->csa.dar, SPE_EVENT_DMA_ALIGNMENT);
+
+	if (stat & CLASS0_INVALID_DMA_COMMAND_INTR)
+		spufs_handle_event(ctx, ctx->csa.dar, SPE_EVENT_INVALID_DMA);
+
+	if (stat & CLASS0_SPU_ERROR_INTR)
+		spufs_handle_event(ctx, ctx->csa.dar, SPE_EVENT_SPE_ERROR);
+
+	return -EIO;
 }
-EXPORT_SYMBOL_GPL(spufs_dma_callback);
 
 /*
  * bottom half handler for page faults, we can't do this from
@@ -167,16 +119,8 @@ int spufs_handle_class1(struct spu_context *ctx)
 	 * in time, we can still expect to get the same fault
 	 * the immediately after the context restore.
 	 */
-	if (ctx->state == SPU_STATE_RUNNABLE) {
-		ea = ctx->spu->dar;
-		dsisr = ctx->spu->dsisr;
-		ctx->spu->dar= ctx->spu->dsisr = 0;
-	} else {
-		ea = ctx->csa.priv1.mfc_dar_RW;
-		dsisr = ctx->csa.priv1.mfc_dsisr_RW;
-		ctx->csa.priv1.mfc_dar_RW = 0;
-		ctx->csa.priv1.mfc_dsisr_RW = 0;
-	}
+	ea = ctx->csa.dar;
+	dsisr = ctx->csa.dsisr;
 
 	if (!(dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED)))
 		return 0;
@@ -203,7 +147,19 @@ int spufs_handle_class1(struct spu_context *ctx)
 	if (ret)
 		ret = spu_handle_mm_fault(current->mm, ea, dsisr, &flt);
 
-	spu_acquire(ctx);
+	/*
+	 * This is nasty: we need the state_mutex for all the bookkeeping even
+	 * if the syscall was interrupted by a signal. ewww.
+	 */
+	mutex_lock(&ctx->state_mutex);
+
+	/*
+	 * Clear dsisr under ctxt lock after handling the fault, so that
+	 * time slicing will not preempt the context while the page fault
+	 * handler is running. Context switch code removes mappings.
+	 */
+	ctx->csa.dar = ctx->csa.dsisr = 0;
+
 	/*
 	 * If we handled the fault successfully and are in runnable
 	 * state, restart the DMA.
@@ -224,9 +180,8 @@ int spufs_handle_class1(struct spu_context *ctx)
 		if (ctx->spu)
 			ctx->ops->restart_dma(ctx);
 	} else
-		spufs_handle_dma_error(ctx, ea, SPE_EVENT_SPE_DATA_STORAGE);
+		spufs_handle_event(ctx, ea, SPE_EVENT_SPE_DATA_STORAGE);
 
 	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(spufs_handle_class1);
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 6eb9c94..db03afa 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -40,6 +40,120 @@
 
 #define SPUFS_MMAP_4K (PAGE_SIZE == 0x1000)
 
+/* Simple attribute files */
+struct spufs_attr {
+	int (*get)(void *, u64 *);
+	int (*set)(void *, u64);
+	char get_buf[24];       /* enough to store a u64 and "\n\0" */
+	char set_buf[24];
+	void *data;
+	const char *fmt;        /* format for read operation */
+	struct mutex mutex;     /* protects access to these buffers */
+};
+
+static int spufs_attr_open(struct inode *inode, struct file *file,
+		int (*get)(void *, u64 *), int (*set)(void *, u64),
+		const char *fmt)
+{
+	struct spufs_attr *attr;
+
+	attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	attr->get = get;
+	attr->set = set;
+	attr->data = inode->i_private;
+	attr->fmt = fmt;
+	mutex_init(&attr->mutex);
+	file->private_data = attr;
+
+	return nonseekable_open(inode, file);
+}
+
+static int spufs_attr_release(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+	return 0;
+}
+
+static ssize_t spufs_attr_read(struct file *file, char __user *buf,
+		size_t len, loff_t *ppos)
+{
+	struct spufs_attr *attr;
+	size_t size;
+	ssize_t ret;
+
+	attr = file->private_data;
+	if (!attr->get)
+		return -EACCES;
+
+	ret = mutex_lock_interruptible(&attr->mutex);
+	if (ret)
+		return ret;
+
+	if (*ppos) {		/* continued read */
+		size = strlen(attr->get_buf);
+	} else {		/* first read */
+		u64 val;
+		ret = attr->get(attr->data, &val);
+		if (ret)
+			goto out;
+
+		size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
+				 attr->fmt, (unsigned long long)val);
+	}
+
+	ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
+out:
+	mutex_unlock(&attr->mutex);
+	return ret;
+}
+
+static ssize_t spufs_attr_write(struct file *file, const char __user *buf,
+		size_t len, loff_t *ppos)
+{
+	struct spufs_attr *attr;
+	u64 val;
+	size_t size;
+	ssize_t ret;
+
+	attr = file->private_data;
+	if (!attr->set)
+		return -EACCES;
+
+	ret = mutex_lock_interruptible(&attr->mutex);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
+	size = min(sizeof(attr->set_buf) - 1, len);
+	if (copy_from_user(attr->set_buf, buf, size))
+		goto out;
+
+	ret = len; /* claim we got the whole input */
+	attr->set_buf[size] = '\0';
+	val = simple_strtol(attr->set_buf, NULL, 0);
+	attr->set(attr->data, val);
+out:
+	mutex_unlock(&attr->mutex);
+	return ret;
+}
+
+#define DEFINE_SPUFS_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt)	\
+static int __fops ## _open(struct inode *inode, struct file *file)	\
+{									\
+	__simple_attr_check_format(__fmt, 0ull);			\
+	return spufs_attr_open(inode, file, __get, __set, __fmt);	\
+}									\
+static struct file_operations __fops = {				\
+	.owner	 = THIS_MODULE,						\
+	.open	 = __fops ## _open,					\
+	.release = spufs_attr_release,					\
+	.read	 = spufs_attr_read,					\
+	.write	 = spufs_attr_write,					\
+};
+
 
 static int
 spufs_mem_open(struct inode *inode, struct file *file)
@@ -84,9 +198,12 @@ spufs_mem_read(struct file *file, char __user *buffer,
 	struct spu_context *ctx = file->private_data;
 	ssize_t ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_mem_read(ctx, buffer, size, pos);
 	spu_release(ctx);
+
 	return ret;
 }
 
@@ -106,7 +223,10 @@ spufs_mem_write(struct file *file, const char __user *buffer,
 	if (size > LS_SIZE - pos)
 		size = LS_SIZE - pos;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
+
 	local_store = ctx->ops->get_ls(ctx);
 	ret = copy_from_user(local_store + pos, buffer, size);
 	spu_release(ctx);
@@ -121,16 +241,33 @@ static unsigned long spufs_mem_mmap_nopfn(struct vm_area_struct *vma,
 					  unsigned long address)
 {
 	struct spu_context *ctx	= vma->vm_file->private_data;
-	unsigned long pfn, offset;
+	unsigned long pfn, offset, addr0 = address;
+#ifdef CONFIG_SPU_FS_64K_LS
+	struct spu_state *csa = &ctx->csa;
+	int psize;
+
+	/* Check what page size we are using */
+	psize = get_slice_psize(vma->vm_mm, address);
+
+	/* Some sanity checking */
+	BUG_ON(csa->use_big_pages != (psize == MMU_PAGE_64K));
+
+	/* Wow, 64K, cool, we need to align the address though */
+	if (csa->use_big_pages) {
+		BUG_ON(vma->vm_start & 0xffff);
+		address &= ~0xfffful;
+	}
+#endif /* CONFIG_SPU_FS_64K_LS */
 
 	offset = (address - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
 	if (offset >= LS_SIZE)
 		return NOPFN_SIGBUS;
 
-	pr_debug("spufs_mem_mmap_nopfn address=0x%lx, offset=0x%lx\n",
-		 address, offset);
+	pr_debug("spufs_mem_mmap_nopfn address=0x%lx -> 0x%lx, offset=0x%lx\n",
+		 addr0, address, offset);
 
-	spu_acquire(ctx);
+	if (spu_acquire(ctx))
+		return NOPFN_REFAULT;
 
 	if (ctx->state == SPU_STATE_SAVED) {
 		vma->vm_page_prot = __pgprot(pgprot_val(vma->vm_page_prot)
@@ -155,6 +292,22 @@ static struct vm_operations_struct spufs_mem_mmap_vmops = {
 
 static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
 {
+#ifdef CONFIG_SPU_FS_64K_LS
+	struct spu_context	*ctx = file->private_data;
+	struct spu_state	*csa = &ctx->csa;
+
+	/* Sanity check VMA alignment */
+	if (csa->use_big_pages) {
+		pr_debug("spufs_mem_mmap 64K, start=0x%lx, end=0x%lx,"
+			 " pgoff=0x%lx\n", vma->vm_start, vma->vm_end,
+			 vma->vm_pgoff);
+		if (vma->vm_start & 0xffff)
+			return -EINVAL;
+		if (vma->vm_pgoff & 0xf)
+			return -EINVAL;
+	}
+#endif /* CONFIG_SPU_FS_64K_LS */
+
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
@@ -166,6 +319,25 @@ static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+#ifdef CONFIG_SPU_FS_64K_LS
+unsigned long spufs_get_unmapped_area(struct file *file, unsigned long addr,
+				      unsigned long len, unsigned long pgoff,
+				      unsigned long flags)
+{
+	struct spu_context	*ctx = file->private_data;
+	struct spu_state	*csa = &ctx->csa;
+
+	/* If not using big pages, fallback to normal MM g_u_a */
+	if (!csa->use_big_pages)
+		return current->mm->get_unmapped_area(file, addr, len,
+						      pgoff, flags);
+
+	/* Else, try to obtain a 64K pages slice */
+	return slice_get_unmapped_area(addr, len, flags,
+				       MMU_PAGE_64K, 1, 0);
+}
+#endif /* CONFIG_SPU_FS_64K_LS */
+
 static const struct file_operations spufs_mem_fops = {
 	.open			= spufs_mem_open,
 	.release		= spufs_mem_release,
@@ -173,6 +345,9 @@ static const struct file_operations spufs_mem_fops = {
 	.write			= spufs_mem_write,
 	.llseek			= generic_file_llseek,
 	.mmap			= spufs_mem_mmap,
+#ifdef CONFIG_SPU_FS_64K_LS
+	.get_unmapped_area	= spufs_get_unmapped_area,
+#endif
 };
 
 static unsigned long spufs_ps_nopfn(struct vm_area_struct *vma,
@@ -182,23 +357,44 @@ static unsigned long spufs_ps_nopfn(struct vm_area_struct *vma,
 {
 	struct spu_context *ctx = vma->vm_file->private_data;
 	unsigned long area, offset = address - vma->vm_start;
-	int ret;
+	int ret = 0;
 
 	offset += vma->vm_pgoff << PAGE_SHIFT;
 	if (offset >= ps_size)
 		return NOPFN_SIGBUS;
 
-	/* error here usually means a signal.. we might want to test
-	 * the error code more precisely though
+	/*
+	 * Because we release the mmap_sem, the context may be destroyed while
+	 * we're in spu_wait. Grab an extra reference so it isn't destroyed
+	 * in the meantime.
 	 */
-	ret = spu_acquire_runnable(ctx, 0);
-	if (ret)
-		return NOPFN_REFAULT;
+	get_spu_context(ctx);
 
-	area = ctx->spu->problem_phys + ps_offs;
-	vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT);
-	spu_release(ctx);
+	/*
+	 * We have to wait for context to be loaded before we have
+	 * pages to hand out to the user, but we don't want to wait
+	 * with the mmap_sem held.
+	 * It is possible to drop the mmap_sem here, but then we need
+	 * to return NOPFN_REFAULT because the mappings may have
+	 * hanged.
+	 */
+	if (spu_acquire(ctx))
+		goto refault;
+
+	if (ctx->state == SPU_STATE_SAVED) {
+		up_read(&current->mm->mmap_sem);
+		ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
+		down_read(&current->mm->mmap_sem);
+	} else {
+		area = ctx->spu->problem_phys + ps_offs;
+		vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT);
+	}
+
+	if (!ret)
+		spu_release(ctx);
 
+refault:
+	put_spu_context(ctx);
 	return NOPFN_REFAULT;
 }
 
@@ -232,25 +428,32 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma)
 #define spufs_cntl_mmap NULL
 #endif /* !SPUFS_MMAP_4K */
 
-static u64 spufs_cntl_get(void *data)
+static int spufs_cntl_get(void *data, u64 *val)
 {
 	struct spu_context *ctx = data;
-	u64 val;
+	int ret;
 
-	spu_acquire(ctx);
-	val = ctx->ops->status_read(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
+	*val = ctx->ops->status_read(ctx);
 	spu_release(ctx);
 
-	return val;
+	return 0;
 }
 
-static void spufs_cntl_set(void *data, u64 val)
+static int spufs_cntl_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
+	int ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->runcntl_write(ctx, val);
 	spu_release(ctx);
+
+	return 0;
 }
 
 static int spufs_cntl_open(struct inode *inode, struct file *file)
@@ -263,7 +466,7 @@ static int spufs_cntl_open(struct inode *inode, struct file *file)
 	if (!i->i_openers++)
 		ctx->cntl = inode->i_mapping;
 	mutex_unlock(&ctx->mapping_lock);
-	return simple_attr_open(inode, file, spufs_cntl_get,
+	return spufs_attr_open(inode, file, spufs_cntl_get,
 					spufs_cntl_set, "0x%08lx");
 }
 
@@ -273,7 +476,7 @@ spufs_cntl_release(struct inode *inode, struct file *file)
 	struct spufs_inode_info *i = SPUFS_I(inode);
 	struct spu_context *ctx = i->i_ctx;
 
-	simple_attr_close(inode, file);
+	spufs_attr_release(inode, file);
 
 	mutex_lock(&ctx->mapping_lock);
 	if (!--i->i_openers)
@@ -285,8 +488,8 @@ spufs_cntl_release(struct inode *inode, struct file *file)
 static const struct file_operations spufs_cntl_fops = {
 	.open = spufs_cntl_open,
 	.release = spufs_cntl_release,
-	.read = simple_attr_read,
-	.write = simple_attr_write,
+	.read = spufs_attr_read,
+	.write = spufs_attr_write,
 	.mmap = spufs_cntl_mmap,
 };
 
@@ -314,7 +517,9 @@ spufs_regs_read(struct file *file, char __user *buffer,
 	int ret;
 	struct spu_context *ctx = file->private_data;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_regs_read(ctx, buffer, size, pos);
 	spu_release_saved(ctx);
 	return ret;
@@ -333,7 +538,9 @@ spufs_regs_write(struct file *file, const char __user *buffer,
 		return -EFBIG;
 	*pos += size;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 
 	ret = copy_from_user(lscsa->gprs + *pos - size,
 			     buffer, size) ? -EFAULT : size;
@@ -365,7 +572,9 @@ spufs_fpcr_read(struct file *file, char __user * buffer,
 	int ret;
 	struct spu_context *ctx = file->private_data;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_fpcr_read(ctx, buffer, size, pos);
 	spu_release_saved(ctx);
 	return ret;
@@ -382,10 +591,12 @@ spufs_fpcr_write(struct file *file, const char __user * buffer,
 	size = min_t(ssize_t, sizeof(lscsa->fpcr) - *pos, size);
 	if (size <= 0)
 		return -EFBIG;
-	*pos += size;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 
+	*pos += size;
 	ret = copy_from_user((char *)&lscsa->fpcr + *pos - size,
 			     buffer, size) ? -EFAULT : size;
 
@@ -432,7 +643,10 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf,
 
 	udata = (void __user *)buf;
 
-	spu_acquire(ctx);
+	count = spu_acquire(ctx);
+	if (count)
+		return count;
+
 	for (count = 0; (count + 4) <= len; count += 4, udata++) {
 		int ret;
 		ret = ctx->ops->mbox_read(ctx, &mbox_data);
@@ -468,12 +682,15 @@ static ssize_t spufs_mbox_stat_read(struct file *file, char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx = file->private_data;
+	ssize_t ret;
 	u32 mbox_stat;
 
 	if (len < 4)
 		return -EINVAL;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 
 	mbox_stat = ctx->ops->mbox_stat_read(ctx) & 0xff;
 
@@ -508,6 +725,9 @@ void spufs_ibox_callback(struct spu *spu)
 {
 	struct spu_context *ctx = spu->ctx;
 
+	if (!ctx)
+		return;
+
 	wake_up_all(&ctx->ibox_wq);
 	kill_fasync(&ctx->ibox_fasync, SIGIO, POLLIN);
 }
@@ -539,23 +759,27 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
 
 	udata = (void __user *)buf;
 
-	spu_acquire(ctx);
+	count = spu_acquire(ctx);
+	if (count)
+		goto out;
 
 	/* wait only for the first element */
 	count = 0;
 	if (file->f_flags & O_NONBLOCK) {
-		if (!spu_ibox_read(ctx, &ibox_data))
+		if (!spu_ibox_read(ctx, &ibox_data)) {
 			count = -EAGAIN;
+			goto out_unlock;
+		}
 	} else {
 		count = spufs_wait(ctx->ibox_wq, spu_ibox_read(ctx, &ibox_data));
+		if (count)
+			goto out;
 	}
-	if (count)
-		goto out;
 
 	/* if we can't write at all, return -EFAULT */
 	count = __put_user(ibox_data, udata);
 	if (count)
-		goto out;
+		goto out_unlock;
 
 	for (count = 4, udata++; (count + 4) <= len; count += 4, udata++) {
 		int ret;
@@ -572,9 +796,9 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
 			break;
 	}
 
-out:
+out_unlock:
 	spu_release(ctx);
-
+out:
 	return count;
 }
 
@@ -585,7 +809,11 @@ static unsigned int spufs_ibox_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &ctx->ibox_wq, wait);
 
-	spu_acquire(ctx);
+	/*
+	 * For now keep this uninterruptible and also ignore the rule
+	 * that poll should not sleep.  Will be fixed later.
+	 */
+	mutex_lock(&ctx->state_mutex);
 	mask = ctx->ops->mbox_stat_poll(ctx, POLLIN | POLLRDNORM);
 	spu_release(ctx);
 
@@ -603,12 +831,15 @@ static ssize_t spufs_ibox_stat_read(struct file *file, char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx = file->private_data;
+	ssize_t ret;
 	u32 ibox_stat;
 
 	if (len < 4)
 		return -EINVAL;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ibox_stat = (ctx->ops->mbox_stat_read(ctx) >> 16) & 0xff;
 	spu_release(ctx);
 
@@ -644,6 +875,9 @@ void spufs_wbox_callback(struct spu *spu)
 {
 	struct spu_context *ctx = spu->ctx;
 
+	if (!ctx)
+		return;
+
 	wake_up_all(&ctx->wbox_wq);
 	kill_fasync(&ctx->wbox_fasync, SIGIO, POLLOUT);
 }
@@ -677,7 +911,9 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
 	if (__get_user(wbox_data, udata))
 		return -EFAULT;
 
-	spu_acquire(ctx);
+	count = spu_acquire(ctx);
+	if (count)
+		goto out;
 
 	/*
 	 * make sure we can at least write one element, by waiting
@@ -685,16 +921,18 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
 	 */
 	count = 0;
 	if (file->f_flags & O_NONBLOCK) {
-		if (!spu_wbox_write(ctx, wbox_data))
+		if (!spu_wbox_write(ctx, wbox_data)) {
 			count = -EAGAIN;
+			goto out_unlock;
+		}
 	} else {
 		count = spufs_wait(ctx->wbox_wq, spu_wbox_write(ctx, wbox_data));
+		if (count)
+			goto out;
 	}
 
-	if (count)
-		goto out;
 
-	/* write aѕ much as possible */
+	/* write as much as possible */
 	for (count = 4, udata++; (count + 4) <= len; count += 4, udata++) {
 		int ret;
 		ret = __get_user(wbox_data, udata);
@@ -706,8 +944,9 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
 			break;
 	}
 
-out:
+out_unlock:
 	spu_release(ctx);
+out:
 	return count;
 }
 
@@ -718,7 +957,11 @@ static unsigned int spufs_wbox_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &ctx->wbox_wq, wait);
 
-	spu_acquire(ctx);
+	/*
+	 * For now keep this uninterruptible and also ignore the rule
+	 * that poll should not sleep.  Will be fixed later.
+	 */
+	mutex_lock(&ctx->state_mutex);
 	mask = ctx->ops->mbox_stat_poll(ctx, POLLOUT | POLLWRNORM);
 	spu_release(ctx);
 
@@ -736,12 +979,15 @@ static ssize_t spufs_wbox_stat_read(struct file *file, char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx = file->private_data;
+	ssize_t ret;
 	u32 wbox_stat;
 
 	if (len < 4)
 		return -EINVAL;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	wbox_stat = (ctx->ops->mbox_stat_read(ctx) >> 8) & 0xff;
 	spu_release(ctx);
 
@@ -812,7 +1058,9 @@ static ssize_t spufs_signal1_read(struct file *file, char __user *buf,
 	int ret;
 	struct spu_context *ctx = file->private_data;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_signal1_read(ctx, buf, len, pos);
 	spu_release_saved(ctx);
 
@@ -823,6 +1071,7 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx;
+	ssize_t ret;
 	u32 data;
 
 	ctx = file->private_data;
@@ -833,7 +1082,9 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf,
 	if (copy_from_user(&data, buf, 4))
 		return -EFAULT;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->signal1_write(ctx, data);
 	spu_release(ctx);
 
@@ -943,7 +1194,9 @@ static ssize_t spufs_signal2_read(struct file *file, char __user *buf,
 	struct spu_context *ctx = file->private_data;
 	int ret;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	ret = __spufs_signal2_read(ctx, buf, len, pos);
 	spu_release_saved(ctx);
 
@@ -954,6 +1207,7 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx;
+	ssize_t ret;
 	u32 data;
 
 	ctx = file->private_data;
@@ -964,7 +1218,9 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf,
 	if (copy_from_user(&data, buf, 4))
 		return -EFAULT;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->signal2_write(ctx, data);
 	spu_release(ctx);
 
@@ -1032,33 +1288,42 @@ static const struct file_operations spufs_signal2_nosched_fops = {
 #define SPU_ATTR_ACQUIRE_SAVED	2
 
 #define DEFINE_SPUFS_ATTRIBUTE(__name, __get, __set, __fmt, __acquire)	\
-static u64 __##__get(void *data)					\
+static int __##__get(void *data, u64 *val)				\
 {									\
 	struct spu_context *ctx = data;					\
-	u64 ret;							\
+	int ret = 0;							\
 									\
 	if (__acquire == SPU_ATTR_ACQUIRE) {				\
-		spu_acquire(ctx);					\
-		ret = __get(ctx);					\
+		ret = spu_acquire(ctx);					\
+		if (ret)						\
+			return ret;					\
+		*val = __get(ctx);					\
 		spu_release(ctx);					\
 	} else if (__acquire == SPU_ATTR_ACQUIRE_SAVED)	{		\
-		spu_acquire_saved(ctx);					\
-		ret = __get(ctx);					\
+		ret = spu_acquire_saved(ctx);				\
+		if (ret)						\
+			return ret;					\
+		*val = __get(ctx);					\
 		spu_release_saved(ctx);					\
 	} else								\
-		ret = __get(ctx);					\
+		*val = __get(ctx);					\
 									\
-	return ret;							\
+	return 0;							\
 }									\
-DEFINE_SIMPLE_ATTRIBUTE(__name, __##__get, __set, __fmt);
+DEFINE_SPUFS_SIMPLE_ATTRIBUTE(__name, __##__get, __set, __fmt);
 
-static void spufs_signal1_type_set(void *data, u64 val)
+static int spufs_signal1_type_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
+	int ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->signal1_type_set(ctx, val);
 	spu_release(ctx);
+
+	return 0;
 }
 
 static u64 spufs_signal1_type_get(struct spu_context *ctx)
@@ -1069,13 +1334,18 @@ DEFINE_SPUFS_ATTRIBUTE(spufs_signal1_type, spufs_signal1_type_get,
 		       spufs_signal1_type_set, "%llu", SPU_ATTR_ACQUIRE);
 
 
-static void spufs_signal2_type_set(void *data, u64 val)
+static int spufs_signal2_type_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
+	int ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->signal2_type_set(ctx, val);
 	spu_release(ctx);
+
+	return 0;
 }
 
 static u64 spufs_signal2_type_get(struct spu_context *ctx)
@@ -1275,6 +1545,9 @@ void spufs_mfc_callback(struct spu *spu)
 {
 	struct spu_context *ctx = spu->ctx;
 
+	if (!ctx)
+		return;
+
 	wake_up_all(&ctx->mfc_wq);
 
 	pr_debug("%s %s\n", __FUNCTION__, spu->name);
@@ -1321,22 +1594,26 @@ static ssize_t spufs_mfc_read(struct file *file, char __user *buffer,
 	if (size != 4)
 		goto out;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
 	if (file->f_flags & O_NONBLOCK) {
 		status = ctx->ops->read_mfc_tagstatus(ctx);
 		if (!(status & ctx->tagwait))
 			ret = -EAGAIN;
 		else
+			/* XXX(hch): shouldn't we clear ret here? */
 			ctx->tagwait &= ~status;
 	} else {
 		ret = spufs_wait(ctx->mfc_wq,
 			   spufs_read_mfc_tagstatus(ctx, &status));
+		if (ret)
+			goto out;
 	}
 	spu_release(ctx);
 
-	if (ret)
-		goto out;
-
 	ret = 4;
 	if (copy_to_user(buffer, &status, 4))
 		ret = -EFAULT;
@@ -1451,7 +1728,11 @@ static ssize_t spufs_mfc_write(struct file *file, const char __user *buffer,
 	if (ret)
 		goto out;
 
-	ret = spu_acquire_runnable(ctx, 0);
+	ret = spu_acquire(ctx);
+	if (ret)
+		goto out;
+
+	ret = spufs_wait(ctx->run_wq, ctx->state == SPU_STATE_RUNNABLE);
 	if (ret)
 		goto out;
 
@@ -1461,6 +1742,8 @@ static ssize_t spufs_mfc_write(struct file *file, const char __user *buffer,
 		int status;
 		ret = spufs_wait(ctx->mfc_wq,
 				 spu_send_mfc_command(ctx, cmd, &status));
+		if (ret)
+			goto out;
 		if (status)
 			ret = status;
 	}
@@ -1485,7 +1768,11 @@ static unsigned int spufs_mfc_poll(struct file *file,poll_table *wait)
 
 	poll_wait(file, &ctx->mfc_wq, wait);
 
-	spu_acquire(ctx);
+	/*
+	 * For now keep this uninterruptible and also ignore the rule
+	 * that poll should not sleep.  Will be fixed later.
+	 */
+	mutex_lock(&ctx->state_mutex);
 	ctx->ops->set_mfc_query(ctx, ctx->tagwait, 2);
 	free_elements = ctx->ops->get_mfc_free_elements(ctx);
 	tagstatus = ctx->ops->read_mfc_tagstatus(ctx);
@@ -1508,7 +1795,9 @@ static int spufs_mfc_flush(struct file *file, fl_owner_t id)
 	struct spu_context *ctx = file->private_data;
 	int ret;
 
-	spu_acquire(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		goto out;
 #if 0
 /* this currently hangs */
 	ret = spufs_wait(ctx->mfc_wq,
@@ -1517,12 +1806,13 @@ static int spufs_mfc_flush(struct file *file, fl_owner_t id)
 		goto out;
 	ret = spufs_wait(ctx->mfc_wq,
 			 ctx->ops->read_mfc_tagstatus(ctx) == ctx->tagwait);
-out:
+	if (ret)
+		goto out;
 #else
 	ret = 0;
 #endif
 	spu_release(ctx);
-
+out:
 	return ret;
 }
 
@@ -1551,12 +1841,18 @@ static const struct file_operations spufs_mfc_fops = {
 	.mmap	 = spufs_mfc_mmap,
 };
 
-static void spufs_npc_set(void *data, u64 val)
+static int spufs_npc_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
-	spu_acquire(ctx);
+	int ret;
+
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 	ctx->ops->npc_write(ctx, val);
 	spu_release(ctx);
+
+	return 0;
 }
 
 static u64 spufs_npc_get(struct spu_context *ctx)
@@ -1566,13 +1862,19 @@ static u64 spufs_npc_get(struct spu_context *ctx)
 DEFINE_SPUFS_ATTRIBUTE(spufs_npc_ops, spufs_npc_get, spufs_npc_set,
 		       "0x%llx\n", SPU_ATTR_ACQUIRE);
 
-static void spufs_decr_set(void *data, u64 val)
+static int spufs_decr_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	spu_acquire_saved(ctx);
+	int ret;
+
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	lscsa->decr.slot[0] = (u32) val;
 	spu_release_saved(ctx);
+
+	return 0;
 }
 
 static u64 spufs_decr_get(struct spu_context *ctx)
@@ -1583,15 +1885,21 @@ static u64 spufs_decr_get(struct spu_context *ctx)
 DEFINE_SPUFS_ATTRIBUTE(spufs_decr_ops, spufs_decr_get, spufs_decr_set,
 		       "0x%llx\n", SPU_ATTR_ACQUIRE_SAVED);
 
-static void spufs_decr_status_set(void *data, u64 val)
+static int spufs_decr_status_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
-	spu_acquire_saved(ctx);
+	int ret;
+
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	if (val)
 		ctx->csa.priv2.mfc_control_RW |= MFC_CNTL_DECREMENTER_RUNNING;
 	else
 		ctx->csa.priv2.mfc_control_RW &= ~MFC_CNTL_DECREMENTER_RUNNING;
 	spu_release_saved(ctx);
+
+	return 0;
 }
 
 static u64 spufs_decr_status_get(struct spu_context *ctx)
@@ -1605,13 +1913,19 @@ DEFINE_SPUFS_ATTRIBUTE(spufs_decr_status_ops, spufs_decr_status_get,
 		       spufs_decr_status_set, "0x%llx\n",
 		       SPU_ATTR_ACQUIRE_SAVED);
 
-static void spufs_event_mask_set(void *data, u64 val)
+static int spufs_event_mask_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	spu_acquire_saved(ctx);
+	int ret;
+
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	lscsa->event_mask.slot[0] = (u32) val;
 	spu_release_saved(ctx);
+
+	return 0;
 }
 
 static u64 spufs_event_mask_get(struct spu_context *ctx)
@@ -1636,13 +1950,19 @@ static u64 spufs_event_status_get(struct spu_context *ctx)
 DEFINE_SPUFS_ATTRIBUTE(spufs_event_status_ops, spufs_event_status_get,
 		       NULL, "0x%llx\n", SPU_ATTR_ACQUIRE_SAVED)
 
-static void spufs_srr0_set(void *data, u64 val)
+static int spufs_srr0_set(void *data, u64 val)
 {
 	struct spu_context *ctx = data;
 	struct spu_lscsa *lscsa = ctx->csa.lscsa;
-	spu_acquire_saved(ctx);
+	int ret;
+
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	lscsa->srr0.slot[0] = (u32) val;
 	spu_release_saved(ctx);
+
+	return 0;
 }
 
 static u64 spufs_srr0_get(struct spu_context *ctx)
@@ -1673,10 +1993,12 @@ static u64 spufs_object_id_get(struct spu_context *ctx)
 	return ctx->object_id;
 }
 
-static void spufs_object_id_set(void *data, u64 id)
+static int spufs_object_id_set(void *data, u64 id)
 {
 	struct spu_context *ctx = data;
 	ctx->object_id = id;
+
+	return 0;
 }
 
 DEFINE_SPUFS_ATTRIBUTE(spufs_object_id_ops, spufs_object_id_get,
@@ -1723,13 +2045,13 @@ static const struct file_operations spufs_caps_fops = {
 static ssize_t __spufs_mbox_info_read(struct spu_context *ctx,
 			char __user *buf, size_t len, loff_t *pos)
 {
-	u32 mbox_stat;
 	u32 data;
 
-	mbox_stat = ctx->csa.prob.mb_stat_R;
-	if (mbox_stat & 0x0000ff) {
-		data = ctx->csa.prob.pu_mb_R;
-	}
+	/* EOF if there's no entry in the mbox */
+	if (!(ctx->csa.prob.mb_stat_R & 0x0000ff))
+		return 0;
+
+	data = ctx->csa.prob.pu_mb_R;
 
 	return simple_read_from_buffer(buf, len, pos, &data, sizeof data);
 }
@@ -1743,7 +2065,9 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
 	if (!access_ok(VERIFY_WRITE, buf, len))
 		return -EFAULT;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_mbox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -1761,13 +2085,13 @@ static const struct file_operations spufs_mbox_info_fops = {
 static ssize_t __spufs_ibox_info_read(struct spu_context *ctx,
 				char __user *buf, size_t len, loff_t *pos)
 {
-	u32 ibox_stat;
 	u32 data;
 
-	ibox_stat = ctx->csa.prob.mb_stat_R;
-	if (ibox_stat & 0xff0000) {
-		data = ctx->csa.priv2.puint_mb_R;
-	}
+	/* EOF if there's no entry in the ibox */
+	if (!(ctx->csa.prob.mb_stat_R & 0xff0000))
+		return 0;
+
+	data = ctx->csa.priv2.puint_mb_R;
 
 	return simple_read_from_buffer(buf, len, pos, &data, sizeof data);
 }
@@ -1781,7 +2105,9 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
 	if (!access_ok(VERIFY_WRITE, buf, len))
 		return -EFAULT;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_ibox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -1822,7 +2148,9 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
 	if (!access_ok(VERIFY_WRITE, buf, len))
 		return -EFAULT;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_wbox_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -1872,7 +2200,9 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,
 	if (!access_ok(VERIFY_WRITE, buf, len))
 		return -EFAULT;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_dma_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -1923,7 +2253,9 @@ static ssize_t spufs_proxydma_info_read(struct file *file, char __user *buf,
 	struct spu_context *ctx = file->private_data;
 	int ret;
 
-	spu_acquire_saved(ctx);
+	ret = spu_acquire_saved(ctx);
+	if (ret)
+		return ret;
 	spin_lock(&ctx->csa.register_lock);
 	ret = __spufs_proxydma_info_read(ctx, buf, len, pos);
 	spin_unlock(&ctx->csa.register_lock);
@@ -2012,8 +2344,12 @@ static unsigned long long spufs_class2_intrs(struct spu_context *ctx)
 static int spufs_show_stat(struct seq_file *s, void *private)
 {
 	struct spu_context *ctx = s->private;
+	int ret;
+
+	ret = spu_acquire(ctx);
+	if (ret)
+		return ret;
 
-	spu_acquire(ctx);
 	seq_printf(s, "%s %llu %llu %llu %llu "
 		      "%llu %llu %llu %llu %llu %llu %llu %llu\n",
 		ctx_state_names[ctx->stats.util_state],
@@ -2093,8 +2429,8 @@ struct tree_descr spufs_dir_nosched_contents[] = {
 	{ "mbox_stat", &spufs_mbox_stat_fops, 0444, },
 	{ "ibox_stat", &spufs_ibox_stat_fops, 0444, },
 	{ "wbox_stat", &spufs_wbox_stat_fops, 0444, },
-	{ "signal1", &spufs_signal1_nosched_fops, 0222, },
-	{ "signal2", &spufs_signal2_nosched_fops, 0222, },
+	{ "signal1", &spufs_signal1_fops, 0666, },
+	{ "signal2", &spufs_signal2_fops, 0666, },
 	{ "signal1_type", &spufs_signal1_type, 0666, },
 	{ "signal2_type", &spufs_signal2_type, 0666, },
 	{ "mss", &spufs_mss_fops, 0666, },
diff --git a/arch/powerpc/platforms/cell/spufs/hw_ops.c b/arch/powerpc/platforms/cell/spufs/hw_ops.c
index 428875c..1cd587b 100644
--- a/arch/powerpc/platforms/cell/spufs/hw_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/hw_ops.c
@@ -77,16 +77,18 @@ static unsigned int spu_hw_mbox_stat_poll(struct spu_context *ctx,
 		if (stat & 0xff0000)
 			ret |= POLLIN | POLLRDNORM;
 		else {
-			spu_int_stat_clear(spu, 2, 0x1);
-			spu_int_mask_or(spu, 2, 0x1);
+			spu_int_stat_clear(spu, 2, CLASS2_MAILBOX_INTR);
+			spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_INTR);
 		}
 	}
 	if (events & (POLLOUT | POLLWRNORM)) {
 		if (stat & 0x00ff00)
 			ret = POLLOUT | POLLWRNORM;
 		else {
-			spu_int_stat_clear(spu, 2, 0x10);
-			spu_int_mask_or(spu, 2, 0x10);
+			spu_int_stat_clear(spu, 2,
+					CLASS2_MAILBOX_THRESHOLD_INTR);
+			spu_int_mask_or(spu, 2,
+					CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR);
 		}
 	}
 	spin_unlock_irq(&spu->register_lock);
@@ -107,7 +109,7 @@ static int spu_hw_ibox_read(struct spu_context *ctx, u32 * data)
 		ret = 4;
 	} else {
 		/* make sure we get woken up by the interrupt */
-		spu_int_mask_or(spu, 2, 0x1);
+		spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_INTR);
 		ret = 0;
 	}
 	spin_unlock_irq(&spu->register_lock);
@@ -128,7 +130,7 @@ static int spu_hw_wbox_write(struct spu_context *ctx, u32 data)
 	} else {
 		/* make sure we get woken up by the interrupt when space
 		   becomes available */
-		spu_int_mask_or(spu, 2, 0x10);
+		spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR);
 		ret = 0;
 	}
 	spin_unlock_irq(&spu->register_lock);
@@ -207,6 +209,11 @@ static char *spu_hw_get_ls(struct spu_context *ctx)
 	return ctx->spu->local_store;
 }
 
+static void spu_hw_privcntl_write(struct spu_context *ctx, u64 val)
+{
+	out_be64(&ctx->spu->priv2->spu_privcntl_RW, val);
+}
+
 static u32 spu_hw_runcntl_read(struct spu_context *ctx)
 {
 	return in_be32(&ctx->spu->problem->spu_runcntl_RW);
@@ -216,11 +223,21 @@ static void spu_hw_runcntl_write(struct spu_context *ctx, u32 val)
 {
 	spin_lock_irq(&ctx->spu->register_lock);
 	if (val & SPU_RUNCNTL_ISOLATE)
-		out_be64(&ctx->spu->priv2->spu_privcntl_RW, 4LL);
+		spu_hw_privcntl_write(ctx,
+			SPU_PRIVCNT_LOAD_REQUEST_ENABLE_MASK);
 	out_be32(&ctx->spu->problem->spu_runcntl_RW, val);
 	spin_unlock_irq(&ctx->spu->register_lock);
 }
 
+static void spu_hw_runcntl_stop(struct spu_context *ctx)
+{
+	spin_lock_irq(&ctx->spu->register_lock);
+	out_be32(&ctx->spu->problem->spu_runcntl_RW, SPU_RUNCNTL_STOP);
+	while (in_be32(&ctx->spu->problem->spu_status_R) & SPU_STATUS_RUNNING)
+		cpu_relax();
+	spin_unlock_irq(&ctx->spu->register_lock);
+}
+
 static void spu_hw_master_start(struct spu_context *ctx)
 {
 	struct spu *spu = ctx->spu;
@@ -320,8 +337,10 @@ struct spu_context_ops spu_hw_ops = {
 	.npc_write = spu_hw_npc_write,
 	.status_read = spu_hw_status_read,
 	.get_ls = spu_hw_get_ls,
+	.privcntl_write = spu_hw_privcntl_write,
 	.runcntl_read = spu_hw_runcntl_read,
 	.runcntl_write = spu_hw_runcntl_write,
+	.runcntl_stop = spu_hw_runcntl_stop,
 	.master_start = spu_hw_master_start,
 	.master_stop = spu_hw_master_stop,
 	.set_mfc_query = spu_hw_set_mfc_query,
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index d9d211f..857272d 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -322,7 +322,7 @@ static struct spu_context *
 spufs_assert_affinity(unsigned int flags, struct spu_gang *gang,
 						struct file *filp)
 {
-	struct spu_context *tmp, *neighbor;
+	struct spu_context *tmp, *neighbor, *err;
 	int count, node;
 	int aff_supp;
 
@@ -354,11 +354,15 @@ spufs_assert_affinity(unsigned int flags, struct spu_gang *gang,
 		if (!list_empty(&neighbor->aff_list) && !(neighbor->aff_head) &&
 		    !list_is_last(&neighbor->aff_list, &gang->aff_list_head) &&
 		    !list_entry(neighbor->aff_list.next, struct spu_context,
-		    aff_list)->aff_head)
-			return ERR_PTR(-EEXIST);
+		    aff_list)->aff_head) {
+			err = ERR_PTR(-EEXIST);
+			goto out_put_neighbor;
+		}
 
-		if (gang != neighbor->gang)
-			return ERR_PTR(-EINVAL);
+		if (gang != neighbor->gang) {
+			err = ERR_PTR(-EINVAL);
+			goto out_put_neighbor;
+		}
 
 		count = 1;
 		list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
@@ -372,11 +376,17 @@ spufs_assert_affinity(unsigned int flags, struct spu_gang *gang,
 				break;
 		}
 
-		if (node == MAX_NUMNODES)
-			return ERR_PTR(-EEXIST);
+		if (node == MAX_NUMNODES) {
+			err = ERR_PTR(-EEXIST);
+			goto out_put_neighbor;
+		}
 	}
 
 	return neighbor;
+
+out_put_neighbor:
+	put_spu_context(neighbor);
+	return err;
 }
 
 static void
@@ -454,9 +464,12 @@ spufs_create_context(struct inode *inode, struct dentry *dentry,
 	if (ret)
 		goto out_aff_unlock;
 
-	if (affinity)
+	if (affinity) {
 		spufs_set_affinity(flags, SPUFS_I(dentry->d_inode)->i_ctx,
 								neighbor);
+		if (neighbor)
+			put_spu_context(neighbor);
+	}
 
 	/*
 	 * get references for dget and mntget, will be released
diff --git a/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c b/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c
index 55ac3a6..c2fafe1 100644
--- a/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c
+++ b/arch/powerpc/platforms/cell/spufs/lscsa_alloc.c
@@ -28,6 +28,8 @@
 #include <asm/spu_csa.h>
 #include <asm/mmu.h>
 
+#include "spufs.h"
+
 static int spu_alloc_lscsa_std(struct spu_state *csa)
 {
 	struct spu_lscsa *lscsa;
@@ -60,6 +64,114 @@ static void spu_free_lscsa_std(struct spu_state *csa)
 	vfree(csa->lscsa);
 }
 
+#ifdef CONFIG_SPU_FS_64K_LS
+
+#define SPU_64K_PAGE_SHIFT	16
+#define SPU_64K_PAGE_ORDER	(SPU_64K_PAGE_SHIFT - PAGE_SHIFT)
+#define SPU_64K_PAGE_COUNT	(1ul << SPU_64K_PAGE_ORDER)
+
+int spu_alloc_lscsa(struct spu_state *csa)
+{
+	struct page	**pgarray;
+	unsigned char	*p;
+	int		i, j, n_4k;
+
+	/* Check availability of 64K pages */
+	if (!spu_64k_pages_available())
+		goto fail;
+
+	csa->use_big_pages = 1;
+
+	pr_debug("spu_alloc_lscsa(csa=0x%p), trying to allocate 64K pages\n",
+		 csa);
+
+	/* First try to allocate our 64K pages. We need 5 of them
+	 * with the current implementation. In the future, we should try
+	 * to separate the lscsa from the actual local store image, thus
+	 * allowing us to require only 4 64K pages per context
+	 */
+	for (i = 0; i < SPU_LSCSA_NUM_BIG_PAGES; i++) {
+		/* XXX This is likely to fail, we should use a special pool
+		 *     similar to what hugetlbfs does.
+		 */
+		csa->lscsa_pages[i] = alloc_pages(GFP_KERNEL,
+						  SPU_64K_PAGE_ORDER);
+		if (csa->lscsa_pages[i] == NULL)
+			goto fail;
+	}
+
+	pr_debug(" success ! creating vmap...\n");
+
+	/* Now we need to create a vmalloc mapping of these for the kernel
+	 * and SPU context switch code to use. Currently, we stick to a
+	 * normal kernel vmalloc mapping, which in our case will be 4K
+	 */
+	n_4k = SPU_64K_PAGE_COUNT * SPU_LSCSA_NUM_BIG_PAGES;
+	pgarray = kmalloc(sizeof(struct page *) * n_4k, GFP_KERNEL);
+	if (pgarray == NULL)
+		goto fail;
+	for (i = 0; i < SPU_LSCSA_NUM_BIG_PAGES; i++)
+		for (j = 0; j < SPU_64K_PAGE_COUNT; j++)
+			/* We assume all the struct pages are contiguous
+			 * which should hopefully be the case for an order 4
+			 * allocation.
+			 */
+			pgarray[i * SPU_64K_PAGE_COUNT + j] =
+				csa->lscsa_pages[i] + j;
+	csa->lscsa = vmap(pgarray, n_4k, VM_USERMAP, PAGE_KERNEL);
+	kfree(pgarray);
+	if (csa->lscsa == NULL)
+		goto fail;
+
+	memset(csa->lscsa, 0, sizeof(struct spu_lscsa));
+
+	/* Set LS pages reserved to allow for user-space mapping.
+	 *
+	 * XXX isn't that a bit obsolete ? I think we should just
+	 * make sure the page count is high enough. Anyway, won't harm
+	 * for now
+	 */
+	for (p = csa->lscsa->ls; p < csa->lscsa->ls + LS_SIZE; p += PAGE_SIZE)
+		SetPageReserved(vmalloc_to_page(p));
+
+	pr_debug(" all good !\n");
+
+	return 0;
+fail:
+	pr_debug("spufs: failed to allocate lscsa 64K pages, falling back\n");
+	spu_free_lscsa(csa);
+	return spu_alloc_lscsa_std(csa);
+}
+
+void spu_free_lscsa(struct spu_state *csa)
+{
+	unsigned char *p;
+	int i;
+
+	if (!csa->use_big_pages) {
+		spu_free_lscsa_std(csa);
+		return;
+	}
+	csa->use_big_pages = 0;
+
+	if (csa->lscsa == NULL)
+		goto free_pages;
+
+	for (p = csa->lscsa->ls; p < csa->lscsa->ls + LS_SIZE; p += PAGE_SIZE)
+		ClearPageReserved(vmalloc_to_page(p));
+
+	vunmap(csa->lscsa);
+	csa->lscsa = NULL;
+
+ free_pages:
+
+	for (i = 0; i < SPU_LSCSA_NUM_BIG_PAGES; i++)
+		if (csa->lscsa_pages[i])
+			__free_pages(csa->lscsa_pages[i], SPU_64K_PAGE_ORDER);
+}
+
+#else /* CONFIG_SPU_FS_64K_LS */
+
 int spu_alloc_lscsa(struct spu_state *csa)
 {
 	return spu_alloc_lscsa_std(csa);
@@ -69,3 +181,5 @@ void spu_free_lscsa(struct spu_state *csa)
 {
 	spu_free_lscsa_std(csa);
 }
+
+#endif /* !defined(CONFIG_SPU_FS_64K_LS) */
diff --git a/arch/powerpc/platforms/cell/spufs/run.c b/arch/powerpc/platforms/cell/spufs/run.c
index d7c7113..cac69e1 100644
--- a/arch/powerpc/platforms/cell/spufs/run.c
+++ b/arch/powerpc/platforms/cell/spufs/run.c
@@ -15,24 +15,55 @@ void spufs_stop_callback(struct spu *spu)
 {
 	struct spu_context *ctx = spu->ctx;
 
-	wake_up_all(&ctx->stop_wq);
+	/*
+	 * It should be impossible to preempt a context while an exception
+	 * is being processed, since the context switch code is specially
+	 * coded to deal with interrupts ... But, just in case, sanity check
+	 * the context pointer.  It is OK to return doing nothing since
+	 * the exception will be regenerated when the context is resumed.
+	 */
+	if (ctx) {
+		/* Copy exception arguments into module specific structure */
+		ctx->csa.class_0_pending = spu->class_0_pending;
+		ctx->csa.dsisr = spu->dsisr;
+		ctx->csa.dar = spu->dar;
+
+		/* ensure that the exception status has hit memory before a
+		 * thread waiting on the context's stop queue is woken */
+		smp_wmb();
+
+		wake_up_all(&ctx->stop_wq);
+	}
+
+	/* Clear callback arguments from spu structure */
+	spu->class_0_pending = 0;
+	spu->dsisr = 0;
+	spu->dar = 0;
 }
 
-static inline int spu_stopped(struct spu_context *ctx, u32 *stat)
+int spu_stopped(struct spu_context *ctx, u32 *stat)
 {
-	struct spu *spu;
-	u64 pte_fault;
+	u64 dsisr;
+	u32 stopped;
 
 	*stat = ctx->ops->status_read(ctx);
 
-	spu = ctx->spu;
-	if (ctx->state != SPU_STATE_RUNNABLE ||
-	    test_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags))
+	if (test_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags))
+		return 1;
+
+	stopped = SPU_STATUS_INVALID_INSTR | SPU_STATUS_SINGLE_STEP |
+		SPU_STATUS_STOPPED_BY_HALT | SPU_STATUS_STOPPED_BY_STOP;
+	if (!(*stat & SPU_STATUS_RUNNING) && (*stat & stopped))
+		return 1;
+
+	dsisr = ctx->csa.dsisr;
+	if (dsisr & (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED))
 		return 1;
-	pte_fault = spu->dsisr &
-	    (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED);
-	return (!(*stat & SPU_STATUS_RUNNING) || pte_fault || spu->class_0_pending_value) ?
-		1 : 0;
+
+	if (ctx->csa.class_0_pending)
+		return 1;
+
+	return 0;
 }
 
 static int spu_setup_isolated(struct spu_context *ctx)
@@ -128,35 +159,68 @@ out:
 
 static int spu_run_init(struct spu_context *ctx, u32 *npc)
 {
+	unsigned long runcntl = SPU_RUNCNTL_RUNNABLE;
+	int ret;
+
 	spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 
-	if (ctx->flags & SPU_CREATE_ISOLATE) {
-		unsigned long runcntl;
+	/*
+	 * NOSCHED is synchronous scheduling with respect to the caller.
+	 * The caller waits for the context to be loaded.
+	 */
+	if (ctx->flags & SPU_CREATE_NOSCHED) {
+		if (ctx->state == SPU_STATE_SAVED) {
+			ret = spu_activate(ctx, 0);
+			if (ret)
+				return ret;
+		}
+	}
 
+	/*
+	 * Apply special setup as required.
+	 */
+	if (ctx->flags & SPU_CREATE_ISOLATE) {
 		if (!(ctx->ops->status_read(ctx) & SPU_STATUS_ISOLATED_STATE)) {
-			int ret = spu_setup_isolated(ctx);
+			ret = spu_setup_isolated(ctx);
 			if (ret)
 				return ret;
 		}
 
-		/* if userspace has set the runcntrl register (eg, to issue an
-		 * isolated exit), we need to re-set it here */
+		/*
+		 * If userspace has set the runcntrl register (eg, to
+		 * issue an isolated exit), we need to re-set it here
+		 */
 		runcntl = ctx->ops->runcntl_read(ctx) &
 			(SPU_RUNCNTL_RUNNABLE | SPU_RUNCNTL_ISOLATE);
 		if (runcntl == 0)
 			runcntl = SPU_RUNCNTL_RUNNABLE;
+	}
+
+	if (ctx->flags & SPU_CREATE_NOSCHED) {
+		spuctx_switch_state(ctx, SPU_UTIL_USER);
 		ctx->ops->runcntl_write(ctx, runcntl);
 	} else {
-		unsigned long mode = SPU_PRIVCNTL_MODE_NORMAL;
-		ctx->ops->npc_write(ctx, *npc);
+		unsigned long privcntl;
+
 		if (test_thread_flag(TIF_SINGLESTEP))
-			mode = SPU_PRIVCNTL_MODE_SINGLE_STEP;
-		out_be64(&ctx->spu->priv2->spu_privcntl_RW, mode);
-		ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE);
-	}
+			privcntl = SPU_PRIVCNTL_MODE_SINGLE_STEP;
+		else
+			privcntl = SPU_PRIVCNTL_MODE_NORMAL;
+
+		ctx->ops->npc_write(ctx, *npc);
+		ctx->ops->privcntl_write(ctx, privcntl);
+		ctx->ops->runcntl_write(ctx, runcntl);
 
-	spuctx_switch_state(ctx, SPU_UTIL_USER);
+		if (ctx->state == SPU_STATE_SAVED) {
+			ret = spu_activate(ctx, 0);
+			if (ret)
+				return ret;
+		} else {
+			spuctx_switch_state(ctx, SPU_UTIL_USER);
+		}
+	}
 
+	set_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags);
 	return 0;
 }
 
@@ -165,10 +229,13 @@ static int spu_run_fini(struct spu_context *ctx, u32 *npc,
 {
 	int ret = 0;
 
+	spu_del_from_rq(ctx);
+
 	*status = ctx->ops->status_read(ctx);
 	*npc = ctx->ops->npc_read(ctx);
 
 	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
+	clear_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags);
 	spu_release(ctx);
 
 	if (signal_pending(current))
@@ -177,26 +244,6 @@ static int spu_run_fini(struct spu_context *ctx, u32 *npc,
 	return ret;
 }
 
-static int spu_reacquire_runnable(struct spu_context *ctx, u32 *npc,
-				         u32 *status)
-{
-	int ret;
-
-	ret = spu_run_fini(ctx, npc, status);
-	if (ret)
-		return ret;
-
-	if (*status & (SPU_STATUS_STOPPED_BY_STOP | SPU_STATUS_STOPPED_BY_HALT))
-		return *status;
-
-	ret = spu_acquire_runnable(ctx, 0);
-	if (ret)
-		return ret;
-
-	spuctx_switch_state(ctx, SPU_UTIL_USER);
-	return 0;
-}
-
 /*
  * SPU syscall restarting is tricky because we violate the basic
  * assumption that the signal handler is running on the interrupted
@@ -247,7 +294,7 @@ static int spu_process_callback(struct spu_context *ctx)
 	u32 ls_pointer, npc;
 	void __iomem *ls;
 	long spu_ret;
-	int ret;
+	int ret, ret2;
 
 	/* get syscall block from local store */
 	npc = ctx->ops->npc_read(ctx) & ~3;
@@ -269,9 +316,11 @@ static int spu_process_callback(struct spu_context *ctx)
 		if (spu_ret <= -ERESTARTSYS) {
 			ret = spu_handle_restartsys(ctx, &spu_ret, &npc);
 		}
-		spu_acquire(ctx);
+		ret2 = spu_acquire(ctx);
 		if (ret == -ERESTARTSYS)
 			return ret;
+		if (ret2)
+			return -EINTR;
 	}
 
 	/* write result, jump over indirect pointer */
@@ -281,18 +330,6 @@ static int spu_process_callback(struct spu_context *ctx)
 	return ret;
 }
 
-static inline int spu_process_events(struct spu_context *ctx)
-{
-	struct spu *spu = ctx->spu;
-	int ret = 0;
-
-	if (spu->class_0_pending_value)
-		ret = spu_irq_class_0_bottom(spu);
-	if (!ret && signal_pending(current))
-		ret = -ERESTARTSYS;
-	return ret;
-}
-
 long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 {
 	int ret;
@@ -302,29 +339,14 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 	if (mutex_lock_interruptible(&ctx->run_mutex))
 		return -ERESTARTSYS;
 
-	ctx->ops->master_start(ctx);
+	spu_enable_spu(ctx);
 	ctx->event_return = 0;
 
-	spu_acquire(ctx);
-	if (ctx->state == SPU_STATE_SAVED) {
-		__spu_update_sched_info(ctx);
-		spu_set_timeslice(ctx);
+	ret = spu_acquire(ctx);
+	if (ret)
+		goto out_unlock;
 
-		ret = spu_activate(ctx, 0);
-		if (ret) {
-			spu_release(ctx);
-			goto out;
-		}
-	} else {
-		/*
-		 * We have to update the scheduling priority under active_mutex
-		 * to protect against find_victim().
-		 *
-		 * No need to update the timeslice ASAP, it will get updated
-		 * once the current one has expired.
-		 */
-		spu_update_sched_info(ctx);
-	}
+	spu_update_sched_info(ctx);
 
 	ret = spu_run_init(ctx, npc);
 	if (ret) {
@@ -334,8 +356,15 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 
 	do {
 		ret = spufs_wait(ctx->stop_wq, spu_stopped(ctx, &status));
-		if (unlikely(ret))
+		if (unlikely(ret)) {
+			/*
+			 * This is nasty: we need the state_mutex for all the
+			 * bookkeeping even if the syscall was interrupted by
+			 * a signal. ewww.
+			 */
+			mutex_lock(&ctx->state_mutex);
 			break;
+		}
 		spu = ctx->spu;
 		if (unlikely(test_and_clear_bit(SPU_SCHED_NOTIFY_ACTIVE,
 						&ctx->sched_flags))) {
@@ -358,29 +387,24 @@ long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *event)
 		if (ret)
 			break;
 
-		if (unlikely(ctx->state != SPU_STATE_RUNNABLE)) {
-			ret = spu_reacquire_runnable(ctx, npc, &status);
-			if (ret)
-				goto out2;
-			continue;
-		}
-		ret = spu_process_events(ctx);
+		ret = spufs_handle_class0(ctx);
+		if (ret)
+			break;
 
+		if (signal_pending(current))
+			ret = -ERESTARTSYS;
 	} while (!ret && !(status & (SPU_STATUS_STOPPED_BY_STOP |
 				      SPU_STATUS_STOPPED_BY_HALT |
 				       SPU_STATUS_SINGLE_STEP)));
 
-	if ((status & SPU_STATUS_STOPPED_BY_STOP) &&
-	    (((status >> SPU_STOP_STATUS_SHIFT) & 0x3f00) == 0x2100) &&
-	    (ctx->state == SPU_STATE_RUNNABLE))
-		ctx->stats.libassist++;
-
-
-	ctx->ops->master_stop(ctx);
+	spu_disable_spu(ctx);
 	ret = spu_run_fini(ctx, npc, &status);
 	spu_yield(ctx);
 
-out2:
+	if ((status & SPU_STATUS_STOPPED_BY_STOP) &&
+	    (((status >> SPU_STOP_STATUS_SHIFT) & 0x3f00) == 0x2100))
+		ctx->stats.libassist++;
+
 	if ((ret == 0) ||
 	    ((ret == -ERESTARTSYS) &&
 	     ((status & SPU_STATUS_STOPPED_BY_HALT) ||
@@ -393,14 +417,18 @@ out2:
 	 * since we have TIF_SINGLESTEP set, thus the kernel will do
 	 * it upon return from the syscall anyawy
 	 */
-	if ((status & SPU_STATUS_STOPPED_BY_STOP)
-	    && (status >> SPU_STOP_STATUS_SHIFT) == 0x3fff) {
+	if (unlikely(status & SPU_STATUS_SINGLE_STEP))
+		ret = -ERESTARTSYS;
+
+	else if (unlikely((status & SPU_STATUS_STOPPED_BY_STOP)
+	    && (status >> SPU_STOP_STATUS_SHIFT) == 0x3fff)) {
 		force_sig(SIGTRAP, current);
 		ret = -ERESTARTSYS;
 	}
 
 out:
 	*event = ctx->event_return;
+out_unlock:
 	mutex_unlock(&ctx->run_mutex);
 	return ret;
 }
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 419bcd0..44374c1 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -57,6 +57,7 @@ static unsigned long spu_avenrun[3];
 static struct spu_prio_array *spu_prio;
 static struct task_struct *spusched_task;
 static struct timer_list spusched_timer;
+static struct timer_list spuloadavg_timer;
 
 /*
  * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
@@ -104,15 +105,21 @@ void spu_set_timeslice(struct spu_context *ctx)
 void __spu_update_sched_info(struct spu_context *ctx)
 {
 	/*
-	 * 32-Bit assignment are atomic on powerpc, and we don't care about
-	 * memory ordering here because retriving the controlling thread is
-	 * per defintion racy.
+	 * assert that the context is not on the runqueue, so it is safe
+	 * to change its scheduling parameters.
+	 */
+	BUG_ON(!list_empty(&ctx->rq));
+
+	/*
+	 * 32-Bit assignments are atomic on powerpc, and we don't care about
+	 * memory ordering here because retrieving the controlling thread is
+	 * per definition racy.
 	 */
 	ctx->tid = current->pid;
 
 	/*
 	 * We do our own priority calculations, so we normally want
-	 * ->static_prio to start with. Unfortunately thies field
+	 * ->static_prio to start with. Unfortunately this field
 	 * contains junk for threads with a realtime scheduling
 	 * policy so we have to look at ->prio in this case.
 	 */
@@ -123,23 +130,32 @@ void __spu_update_sched_info(struct spu_context *ctx)
 	ctx->policy = current->policy;
 
 	/*
-	 * A lot of places that don't hold list_mutex poke into
-	 * cpus_allowed, including grab_runnable_context which
-	 * already holds the runq_lock.  So abuse runq_lock
-	 * to protect this field aswell.
+	 * TO DO: the context may be loaded, so we may need to activate
+	 * it again on a different node. But it shouldn't hurt anything
+	 * to update its parameters, because we know that the scheduler
+	 * is not actively looking at this field, since it is not on the
+	 * runqueue. The context will be rescheduled on the proper node
+	 * if it is timesliced or preempted.
 	 */
-	spin_lock(&spu_prio->runq_lock);
 	ctx->cpus_allowed = current->cpus_allowed;
-	spin_unlock(&spu_prio->runq_lock);
 }
 
 void spu_update_sched_info(struct spu_context *ctx)
 {
-	int node = ctx->spu->node;
+	int node;
 
-	mutex_lock(&cbe_spu_info[node].list_mutex);
-	__spu_update_sched_info(ctx);
-	mutex_unlock(&cbe_spu_info[node].list_mutex);
+	if (ctx->state == SPU_STATE_RUNNABLE) {
+		node = ctx->spu->node;
+
+		/*
+		 * Take list_mutex to sync with find_victim().
+		 */
+		mutex_lock(&cbe_spu_info[node].list_mutex);
+		__spu_update_sched_info(ctx);
+		mutex_unlock(&cbe_spu_info[node].list_mutex);
+	} else {
+		__spu_update_sched_info(ctx);
+	}
 }
 
 static int __node_allowed(struct spu_context *ctx, int node)
@@ -173,7 +189,7 @@ void do_notify_spus_active(void)
 	 * Wake up the active spu_contexts.
 	 *
 	 * When the awakened processes see their "notify_active" flag is set,
-	 * they will call spu_switch_notify();
+	 * they will call spu_switch_notify().
 	 */
 	for_each_online_node(node) {
 		struct spu *spu;
@@ -191,14 +207,6 @@ void do_notify_spus_active(void)
 		mutex_unlock(&cbe_spu_info[node].list_mutex);
 	}
 }
-EXPORT_SYMBOL_GPL(do_notify_spus_active);
-
-#ifndef MODULE
-void notify_spus_active(void)
-{
-	do_notify_spus_active();
-}
-#endif
 
 /**
  * spu_bind_context - bind spu context to physical spu
@@ -228,7 +236,6 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	spu->wbox_callback = spufs_wbox_callback;
 	spu->stop_callback = spufs_stop_callback;
 	spu->mfc_callback = spufs_mfc_callback;
-	spu->dma_callback = spufs_dma_callback;
 	mb();
 	spu_unmap_mappings(ctx);
 	spu_restore(&ctx->csa, spu);
@@ -237,7 +244,7 @@ static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 	spu_switch_notify(spu, ctx);
 	ctx->state = SPU_STATE_RUNNABLE;
 
-	spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
+	spuctx_switch_state(ctx, SPU_UTIL_USER);
 }
 
 /*
@@ -416,7 +423,6 @@ static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 	spu->wbox_callback = NULL;
 	spu->stop_callback = NULL;
 	spu->mfc_callback = NULL;
-	spu->dma_callback = NULL;
 	spu_associate_mm(spu, NULL);
 	spu->pid = 0;
 	spu->tgid = 0;
@@ -461,6 +467,13 @@ static void __spu_add_to_rq(struct spu_context *ctx)
 	}
 }
 
+static void spu_add_to_rq(struct spu_context *ctx)
+{
+	spin_lock(&spu_prio->runq_lock);
+	__spu_add_to_rq(ctx);
+	spin_unlock(&spu_prio->runq_lock);
+}
+
 static void __spu_del_from_rq(struct spu_context *ctx)
 {
 	int prio = ctx->prio;
@@ -475,10 +488,24 @@ static void __spu_del_from_rq(struct spu_context *ctx)
 	}
 }
 
+void spu_del_from_rq(struct spu_context *ctx)
+{
+	spin_lock(&spu_prio->runq_lock);
+	__spu_del_from_rq(ctx);
+	spin_unlock(&spu_prio->runq_lock);
+}
+
 static void spu_prio_wait(struct spu_context *ctx)
 {
 	DEFINE_WAIT(wait);
 
+	/*
+	 * The caller must explicitly wait for a context to be loaded
+	 * if the nosched flag is set.  If NOSCHED is not set, the caller
+	 * queues the context and waits for an spu event or error.
+	 */
+	BUG_ON(!(ctx->flags & SPU_CREATE_NOSCHED));
+
 	spin_lock(&spu_prio->runq_lock);
 	prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
 	if (!signal_pending(current)) {
@@ -562,7 +589,7 @@ static struct spu *find_victim(struct spu_context *ctx)
 	/*
 	 * Look for a possible preemption candidate on the local node first.
 	 * If there is no candidate look at the other nodes.  This isn't
-	 * exactly fair, but so far the whole spu schedule tries to keep
+	 * exactly fair, but so far the whole spu scheduler tries to keep
 	 * a strong node affinity.  We might want to fine-tune this in
 	 * the future.
 	 */
@@ -578,6 +605,7 @@ static struct spu *find_victim(struct spu_context *ctx)
 			struct spu_context *tmp = spu->ctx;
 
 			if (tmp && tmp->prio > ctx->prio &&
+			    !(tmp->flags & SPU_CREATE_NOSCHED) &&
 			    (!victim || tmp->prio > victim->prio))
 				victim = spu->ctx;
 		}
@@ -589,6 +617,10 @@ static struct spu *find_victim(struct spu_context *ctx)
 			 * higher priority contexts before lower priority
 			 * ones, so this is safe until we introduce
 			 * priority inheritance schemes.
+			 *
+			 * XXX if the highest priority context is locked,
+			 * this can loop a long time.  Might be better to
+			 * look at another context or give up after X retries.
 			 */
 			if (!mutex_trylock(&victim->state_mutex)) {
 				victim = NULL;
@@ -596,10 +628,10 @@ static struct spu *find_victim(struct spu_context *ctx)
 			}
 
 			spu = victim->spu;
-			if (!spu) {
+			if (!spu || victim->prio <= ctx->prio) {
 				/*
 				 * This race can happen because we've dropped
-				 * the active list mutex.  No a problem, just
+				 * the active list mutex.  Not a problem, just
 				 * restart the search.
 				 */
 				mutex_unlock(&victim->state_mutex);
@@ -614,13 +646,10 @@ static struct spu *find_victim(struct spu_context *ctx)
 
 			victim->stats.invol_ctx_switch++;
 			spu->stats.invol_ctx_switch++;
+			spu_add_to_rq(victim);
+
 			mutex_unlock(&victim->state_mutex);
-			/*
-			 * We need to break out of the wait loop in spu_run
-			 * manually to ensure this context gets put on the
-			 * runqueue again ASAP.
-			 */
-			wake_up(&victim->stop_wq);
+
 			return spu;
 		}
 	}
@@ -628,6 +657,50 @@ static struct spu *find_victim(struct spu_context *ctx)
 	return NULL;
 }
 
+static void __spu_schedule(struct spu *spu, struct spu_context *ctx)
+{
+	int node = spu->node;
+	int success = 0;
+
+	spu_set_timeslice(ctx);
+
+	mutex_lock(&cbe_spu_info[node].list_mutex);
+	if (spu->ctx == NULL) {
+		spu_bind_context(spu, ctx);
+		cbe_spu_info[node].nr_active++;
+		spu->alloc_state = SPU_USED;
+		success = 1;
+	}
+	mutex_unlock(&cbe_spu_info[node].list_mutex);
+
+	if (success)
+		wake_up_all(&ctx->run_wq);
+	else
+		spu_add_to_rq(ctx);
+}
+
+static void spu_schedule(struct spu *spu, struct spu_context *ctx)
+{
+	/* not a candidate for interruptible because it's called either
+	   from the scheduler thread or from spu_deactivate */
+	mutex_lock(&ctx->state_mutex);
+	__spu_schedule(spu, ctx);
+	spu_release(ctx);
+}
+
+static void spu_unschedule(struct spu *spu, struct spu_context *ctx)
+{
+	int node = spu->node;
+
+	mutex_lock(&cbe_spu_info[node].list_mutex);
+	cbe_spu_info[node].nr_active--;
+	spu->alloc_state = SPU_FREE;
+	spu_unbind_context(spu, ctx);
+	ctx->stats.invol_ctx_switch++;
+	spu->stats.invol_ctx_switch++;
+	mutex_unlock(&cbe_spu_info[node].list_mutex);
+}
+
 /**
  * spu_activate - find a free spu for a context and execute it
  * @ctx:	spu context to schedule
@@ -639,39 +712,47 @@ static struct spu *find_victim(struct spu_context *ctx)
  */
 int spu_activate(struct spu_context *ctx, unsigned long flags)
 {
-	do {
-		struct spu *spu;
+	struct spu *spu;
 
-		/*
-		 * If there are multiple threads waiting for a single context
-		 * only one actually binds the context while the others will
-		 * only be able to acquire the state_mutex once the context
-		 * already is in runnable state.
-		 */
-		if (ctx->spu)
-			return 0;
+	/*
+	 * If there are multiple threads waiting for a single context
+	 * only one actually binds the context while the others will
+	 * only be able to acquire the state_mutex once the context
+	 * already is in runnable state.
+	 */
+	if (ctx->spu)
+		return 0;
 
-		spu = spu_get_idle(ctx);
-		/*
-		 * If this is a realtime thread we try to get it running by
-		 * preempting a lower priority thread.
-		 */
-		if (!spu && rt_prio(ctx->prio))
-			spu = find_victim(ctx);
-		if (spu) {
-			int node = spu->node;
+spu_activate_top:
+	if (signal_pending(current))
+		return -ERESTARTSYS;
 
-			mutex_lock(&cbe_spu_info[node].list_mutex);
-			spu_bind_context(spu, ctx);
-			cbe_spu_info[node].nr_active++;
-			mutex_unlock(&cbe_spu_info[node].list_mutex);
-			return 0;
-		}
+	spu = spu_get_idle(ctx);
+	/*
+	 * If this is a realtime thread we try to get it running by
+	 * preempting a lower priority thread.
+	 */
+	if (!spu && rt_prio(ctx->prio))
+		spu = find_victim(ctx);
+	if (spu) {
+		unsigned long runcntl;
 
+		runcntl = ctx->ops->runcntl_read(ctx);
+		__spu_schedule(spu, ctx);
+		if (runcntl & SPU_RUNCNTL_RUNNABLE)
+			spuctx_switch_state(ctx, SPU_UTIL_USER);
+
+		return 0;
+	}
+
+	if (ctx->flags & SPU_CREATE_NOSCHED) {
 		spu_prio_wait(ctx);
-	} while (!signal_pending(current));
+		goto spu_activate_top;
+	}
+
+	spu_add_to_rq(ctx);
 
-	return -ERESTARTSYS;
+	return 0;
 }
 
 /**
@@ -713,21 +794,19 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
 	if (spu) {
 		new = grab_runnable_context(max_prio, spu->node);
 		if (new || force) {
-			int node = spu->node;
-
-			mutex_lock(&cbe_spu_info[node].list_mutex);
-			spu_unbind_context(spu, ctx);
-			spu->alloc_state = SPU_FREE;
-			cbe_spu_info[node].nr_active--;
-			mutex_unlock(&cbe_spu_info[node].list_mutex);
-
-			ctx->stats.vol_ctx_switch++;
-			spu->stats.vol_ctx_switch++;
-
-			if (new)
-				wake_up(&new->stop_wq);
+			spu_unschedule(spu, ctx);
+			if (new) {
+				if (new->flags & SPU_CREATE_NOSCHED)
+					wake_up(&new->stop_wq);
+				else {
+					spu_release(ctx);
+					spu_schedule(spu, new);
+					/* this one can't easily be made
+					   interruptible */
+					mutex_lock(&ctx->state_mutex);
+				}
+			}
 		}
-
 	}
 
 	return new != NULL;
@@ -764,43 +843,36 @@ void spu_yield(struct spu_context *ctx)
 
 static noinline void spusched_tick(struct spu_context *ctx)
 {
+	struct spu_context *new = NULL;
+	struct spu *spu = NULL;
+
+	if (spu_acquire(ctx))
+		BUG();	/* a kernel thread never has signals pending */
+
+	if (ctx->state != SPU_STATE_RUNNABLE)
+		goto out;
 	if (ctx->flags & SPU_CREATE_NOSCHED)
-		return;
+		goto out;
 	if (ctx->policy == SCHED_FIFO)
-		return;
+		goto out;
 
-	if (--ctx->time_slice)
-		return;
+	if (--ctx->time_slice && test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+		goto out;
 
-	/*
-	 * Unfortunately list_mutex ranks outside of state_mutex, so
-	 * we have to trylock here.  If we fail give the context another
-	 * tick and try again.
-	 */
-	if (mutex_trylock(&ctx->state_mutex)) {
-		struct spu *spu = ctx->spu;
-		struct spu_context *new;
-
-		new = grab_runnable_context(ctx->prio + 1, spu->node);
-		if (new) {
-			spu_unbind_context(spu, ctx);
-			ctx->stats.invol_ctx_switch++;
-			spu->stats.invol_ctx_switch++;
-			spu->alloc_state = SPU_FREE;
-			cbe_spu_info[spu->node].nr_active--;
-			wake_up(&new->stop_wq);
-			/*
-			 * We need to break out of the wait loop in
-			 * spu_run manually to ensure this context
-			 * gets put on the runqueue again ASAP.
-			 */
-			wake_up(&ctx->stop_wq);
-		}
-		spu_set_timeslice(ctx);
-		mutex_unlock(&ctx->state_mutex);
+	spu = ctx->spu;
+	new = grab_runnable_context(ctx->prio + 1, spu->node);
+	if (new) {
+		spu_unschedule(spu, ctx);
+		if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags))
+			spu_add_to_rq(ctx);
 	} else {
 		ctx->time_slice++;
 	}
+out:
+	spu_release(ctx);
+
+	if (new)
+		spu_schedule(spu, new);
 }
 
 /**
@@ -824,35 +896,31 @@ static unsigned long count_active_contexts(void)
 }
 
 /**
- * spu_calc_load - given tick count, update the avenrun load estimates.
- * @tick:	tick count
+ * spu_calc_load - update the avenrun load estimates.
  *
  * No locking against reading these values from userspace, as for
  * the CPU loadavg code.
  */
-static void spu_calc_load(unsigned long ticks)
+static void spu_calc_load(void)
 {
 	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_contexts() * FIXED_1;
-		do {
-			CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
+
+	active_tasks = count_active_contexts() * FIXED_1;
+	CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks);
+	CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks);
+	CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
 }
 
 static void spusched_wake(unsigned long data)
 {
 	mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
 	wake_up_process(spusched_task);
-	spu_calc_load(SPUSCHED_TICK);
+}
+
+static void spuloadavg_wake(unsigned long data)
+{
+	mod_timer(&spuloadavg_timer, jiffies + LOAD_FREQ);
+	spu_calc_load();
 }
 
 static int spusched_thread(void *unused)
@@ -864,17 +932,58 @@ static int spusched_thread(void *unused)
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule();
 		for (node = 0; node < MAX_NUMNODES; node++) {
-			mutex_lock(&cbe_spu_info[node].list_mutex);
-			list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
-				if (spu->ctx)
-					spusched_tick(spu->ctx);
-			mutex_unlock(&cbe_spu_info[node].list_mutex);
+			struct mutex *mtx = &cbe_spu_info[node].list_mutex;
+
+			mutex_lock(mtx);
+			list_for_each_entry(spu, &cbe_spu_info[node].spus,
+					cbe_list) {
+				struct spu_context *ctx = spu->ctx;
+
+				if (ctx) {
+					mutex_unlock(mtx);
+					spusched_tick(ctx);
+					mutex_lock(mtx);
+				}
+			}
+			mutex_unlock(mtx);
 		}
 	}
 
 	return 0;
 }
 
+void spuctx_switch_state(struct spu_context *ctx,
+		enum spu_utilization_state new_state)
+{
+	unsigned long long curtime;
+	signed long long delta;
+	struct timespec ts;
+	struct spu *spu;
+	enum spu_utilization_state old_state;
+
+	ktime_get_ts(&ts);
+	curtime = timespec_to_ns(&ts);
+	delta = curtime - ctx->stats.tstamp;
+
+	WARN_ON(!mutex_is_locked(&ctx->state_mutex));
+	WARN_ON(delta < 0);
+
+	spu = ctx->spu;
+	old_state = ctx->stats.util_state;
+	ctx->stats.util_state = new_state;
+	ctx->stats.tstamp = curtime;
+
+	/*
+	 * Update the physical SPU utilization statistics.
+	 */
+	if (spu) {
+		ctx->stats.times[old_state] += delta;
+		spu->stats.times[old_state] += delta;
+		spu->stats.util_state = new_state;
+		spu->stats.tstamp = curtime;
+	}
+}
+
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 
@@ -888,7 +997,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
 
 	/*
 	 * Note that last_pid doesn't really make much sense for the
-	 * SPU loadavg (it even seems very odd on the CPU side..),
+	 * SPU loadavg (it even seems very odd on the CPU side...),
 	 * but we include it here to have a 100% compatible interface.
 	 */
 	seq_printf(s, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
@@ -929,6 +1038,7 @@ int __init spu_sched_init(void)
 	spin_lock_init(&spu_prio->runq_lock);
 
 	setup_timer(&spusched_timer, spusched_wake, 0);
+	setup_timer(&spuloadavg_timer, spuloadavg_wake, 0);
 
 	spusched_task = kthread_run(spusched_thread, NULL, "spusched");
 	if (IS_ERR(spusched_task)) {
@@ -936,6 +1046,8 @@ int __init spu_sched_init(void)
 		goto out_free_spu_prio;
 	}
 
+	mod_timer(&spuloadavg_timer, 0);
+
 	entry = create_proc_entry("spu_loadavg", 0, NULL);
 	if (!entry)
 		goto out_stop_kthread;
@@ -961,6 +1073,7 @@ void spu_sched_exit(void)
 	remove_proc_entry("spu_loadavg", NULL);
 
 	del_timer_sync(&spusched_timer);
+	del_timer_sync(&spuloadavg_timer);
 	kthread_stop(spusched_task);
 
 	for (node = 0; node < MAX_NUMNODES; node++) {
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h
index ca47b99..2bdb197 100644
--- a/arch/powerpc/platforms/cell/spufs/spufs.h
+++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -44,6 +44,7 @@ struct spu_gang;
 enum {
 	SPU_SCHED_NOTIFY_ACTIVE,
 	SPU_SCHED_WAS_ACTIVE,	/* was active upon spu_acquire_saved()  */
+	SPU_SCHED_SPU_RUN,	/* context is within spu_run */
 };
 
 struct spu_context {
@@ -71,6 +72,7 @@ struct spu_context {
 	wait_queue_head_t wbox_wq;
 	wait_queue_head_t stop_wq;
 	wait_queue_head_t mfc_wq;
+	wait_queue_head_t run_wq;
 	struct fasync_struct *ibox_fasync;
 	struct fasync_struct *wbox_fasync;
 	struct fasync_struct *mfc_fasync;
@@ -168,8 +170,10 @@ struct spu_context_ops {
 	void (*npc_write) (struct spu_context * ctx, u32 data);
 	 u32(*status_read) (struct spu_context * ctx);
 	char*(*get_ls) (struct spu_context * ctx);
+	void (*privcntl_write) (struct spu_context *ctx, u64 data);
 	 u32 (*runcntl_read) (struct spu_context * ctx);
 	void (*runcntl_write) (struct spu_context * ctx, u32 data);
+	void (*runcntl_stop) (struct spu_context * ctx);
 	void (*master_start) (struct spu_context * ctx);
 	void (*master_stop) (struct spu_context * ctx);
 	int (*set_mfc_query)(struct spu_context * ctx, u32 mask, u32 mode);
@@ -219,15 +223,16 @@ void spu_gang_add_ctx(struct spu_gang *gang, struct spu_context *ctx);
 
 /* fault handling */
 int spufs_handle_class1(struct spu_context *ctx);
+int spufs_handle_class0(struct spu_context *ctx);
 
 /* affinity */
 struct spu *affinity_check(struct spu_context *ctx);
 
 /* context management */
 extern atomic_t nr_spu_contexts;
-static inline void spu_acquire(struct spu_context *ctx)
+static inline int __must_check spu_acquire(struct spu_context *ctx)
 {
-	mutex_lock(&ctx->state_mutex);
+	return mutex_lock_interruptible(&ctx->state_mutex);
 }
 
 static inline void spu_release(struct spu_context *ctx)
@@ -242,10 +247,11 @@ int put_spu_context(struct spu_context *ctx);
 void spu_unmap_mappings(struct spu_context *ctx);
 
 void spu_forget(struct spu_context *ctx);
-int spu_acquire_runnable(struct spu_context *ctx, unsigned long flags);
-void spu_acquire_saved(struct spu_context *ctx);
+int __must_check spu_acquire_saved(struct spu_context *ctx);
 void spu_release_saved(struct spu_context *ctx);
 
+int spu_stopped(struct spu_context *ctx, u32 * stat);
+void spu_del_from_rq(struct spu_context *ctx);
 int spu_activate(struct spu_context *ctx, unsigned long flags);
 void spu_deactivate(struct spu_context *ctx);
 void spu_yield(struct spu_context *ctx);
@@ -263,6 +269,9 @@ extern char *isolated_loader;
  *	Same as wait_event_interruptible(), except that here
  *	we need to call spu_release(ctx) before sleeping, and
  *	then spu_acquire(ctx) when awoken.
+ *
+ * 	Returns with state_mutex re-acquired when successful or
+ * 	with -ERESTARTSYS and the state_mutex dropped when interrupted.
  */
 
 #define spufs_wait(wq, condition)					\
@@ -273,13 +282,15 @@ extern char *isolated_loader;
 		prepare_to_wait(&(wq), &__wait, TASK_INTERRUPTIBLE);	\
 		if (condition)						\
 			break;						\
+		spu_release(ctx);					\
 		if (signal_pending(current)) {				\
 			__ret = -ERESTARTSYS;				\
 			break;						\
 		}							\
-		spu_release(ctx);					\
 		schedule();						\
-		spu_acquire(ctx);					\
+		__ret = spu_acquire(ctx);				\
+		if (__ret)						\
+			break;						\
 	}								\
 	finish_wait(&(wq), &__wait);					\
 	__ret;								\
@@ -306,41 +317,16 @@ struct spufs_coredump_reader {
 extern struct spufs_coredump_reader spufs_coredump_read[];
 extern int spufs_coredump_num_notes;
 
-/*
- * This function is a little bit too large for an inline, but
- * as fault.c is built into the kernel we can't move it out of
- * line.
- */
-static inline void spuctx_switch_state(struct spu_context *ctx,
-		enum spu_utilization_state new_state)
-{
-	unsigned long long curtime;
-	signed long long delta;
-	struct timespec ts;
-	struct spu *spu;
-	enum spu_utilization_state old_state;
-
-	ktime_get_ts(&ts);
-	curtime = timespec_to_ns(&ts);
-	delta = curtime - ctx->stats.tstamp;
-
-	WARN_ON(!mutex_is_locked(&ctx->state_mutex));
-	WARN_ON(delta < 0);
-
-	spu = ctx->spu;
-	old_state = ctx->stats.util_state;
-	ctx->stats.util_state = new_state;
-	ctx->stats.tstamp = curtime;
-
-	/*
-	 * Update the physical SPU utilization statistics.
-	 */
-	if (spu) {
-		ctx->stats.times[old_state] += delta;
-		spu->stats.times[old_state] += delta;
-		spu->stats.util_state = new_state;
-		spu->stats.tstamp = curtime;
-	}
-}
+extern int spu_init_csa(struct spu_state *csa);
+extern void spu_fini_csa(struct spu_state *csa);
+extern int spu_save(struct spu_state *prev, struct spu *spu);
+extern int spu_restore(struct spu_state *new, struct spu *spu);
+extern int spu_switch(struct spu_state *prev, struct spu_state *new,
+		      struct spu *spu);
+extern int spu_alloc_lscsa(struct spu_state *csa);
+extern void spu_free_lscsa(struct spu_state *csa);
+
+extern void spuctx_switch_state(struct spu_context *ctx,
+		enum spu_utilization_state new_state);
 
 #endif
diff --git a/arch/powerpc/platforms/cell/spufs/switch.c b/arch/powerpc/platforms/cell/spufs/switch.c
index 233df8e..e9dc7a5 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -34,6 +34,7 @@
 
 #include <linux/module.h>
 #include <linux/errno.h>
+#include <linux/hardirq.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -48,6 +49,8 @@
 #include <asm/spu_csa.h>
 #include <asm/mmu_context.h>
 
+#include "spufs.h"
+
 #include "spu_save_dump.h"
 #include "spu_restore_dump.h"
 
@@ -115,6 +118,8 @@ static inline void disable_interrupts(struct spu_state *csa, struct spu *spu)
 	 *     Write INT_MASK_class1 with value of 0.
 	 *     Save INT_Mask_class2 in CSA.
 	 *     Write INT_MASK_class2 with value of 0.
+	 *     Synchronize all three interrupts to be sure
+	 *     we no longer execute a handler on another CPU.
 	 */
 	spin_lock_irq(&spu->register_lock);
 	if (csa) {
@@ -127,6 +132,9 @@ static inline void disable_interrupts(struct spu_state *csa, struct spu *spu)
 	spu_int_mask_set(spu, 2, 0ul);
 	eieio();
 	spin_unlock_irq(&spu->register_lock);
+	synchronize_irq(spu->irqs[0]);
+	synchronize_irq(spu->irqs[1]);
+	synchronize_irq(spu->irqs[2]);
 }
 
 static inline void set_watchdog_timer(struct spu_state *csa, struct spu *spu)
@@ -691,35 +699,9 @@ static inline void resume_mfc_queue(struct spu_state *csa, struct spu *spu)
 	out_be64(&priv2->mfc_control_RW, MFC_CNTL_RESUME_DMA_QUEUE);
 }
 
-static inline void get_kernel_slb(u64 ea, u64 slb[2])
+static inline void setup_mfc_slbs(struct spu_state *csa, struct spu *spu,
+		unsigned int *code, int code_size)
 {
-	u64 llp;
-
-	if (REGION_ID(ea) == KERNEL_REGION_ID)
-		llp = mmu_psize_defs[mmu_linear_psize].sllp;
-	else
-		llp = mmu_psize_defs[mmu_virtual_psize].sllp;
-	slb[0] = (get_kernel_vsid(ea) << SLB_VSID_SHIFT) |
-		SLB_VSID_KERNEL | llp;
-	slb[1] = (ea & ESID_MASK) | SLB_ESID_V;
-}
-
-static inline void load_mfc_slb(struct spu *spu, u64 slb[2], int slbe)
-{
-	struct spu_priv2 __iomem *priv2 = spu->priv2;
-
-	out_be64(&priv2->slb_index_W, slbe);
-	eieio();
-	out_be64(&priv2->slb_vsid_RW, slb[0]);
-	out_be64(&priv2->slb_esid_RW, slb[1]);
-	eieio();
-}
-
-static inline void setup_mfc_slbs(struct spu_state *csa, struct spu *spu)
-{
-	u64 code_slb[2];
-	u64 lscsa_slb[2];
-
 	/* Save, Step 47:
 	 * Restore, Step 30.
 	 *     If MFC_SR1[R]=1, write 0 to SLB_Invalidate_All
@@ -735,11 +717,7 @@ static inline void setup_mfc_slbs(struct spu_state *csa, struct spu *spu)
 	 *     translation is desired by OS environment).
 	 */
 	spu_invalidate_slbs(spu);
-	get_kernel_slb((unsigned long)&spu_save_code[0], code_slb);
-	get_kernel_slb((unsigned long)csa->lscsa, lscsa_slb);
-	load_mfc_slb(spu, code_slb, 0);
-	if ((lscsa_slb[0] != code_slb[0]) || (lscsa_slb[1] != code_slb[1]))
-		load_mfc_slb(spu, lscsa_slb, 1);
+	spu_setup_kernel_slbs(spu, csa->lscsa, code, code_size);
 }
 
 static inline void set_switch_active(struct spu_state *csa, struct spu *spu)
@@ -769,9 +747,9 @@ static inline void enable_interrupts(struct spu_state *csa, struct spu *spu)
 	 *     (translation) interrupts.
 	 */
 	spin_lock_irq(&spu->register_lock);
-	spu_int_stat_clear(spu, 0, ~0ul);
-	spu_int_stat_clear(spu, 1, ~0ul);
-	spu_int_stat_clear(spu, 2, ~0ul);
+	spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+	spu_int_stat_clear(spu, 1, CLASS1_INTR_MASK);
+	spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
 	spu_int_mask_set(spu, 0, 0ul);
 	spu_int_mask_set(spu, 1, class1_mask);
 	spu_int_mask_set(spu, 2, 0ul);
@@ -928,8 +906,8 @@ static inline void wait_tag_complete(struct spu_state *csa, struct spu *spu)
 	POLL_WHILE_FALSE(in_be32(&prob->dma_tagstatus_R) & mask);
 
 	local_irq_save(flags);
-	spu_int_stat_clear(spu, 0, ~(0ul));
-	spu_int_stat_clear(spu, 2, ~(0ul));
+	spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+	spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
 	local_irq_restore(flags);
 }
 
@@ -947,8 +925,8 @@ static inline void wait_spu_stopped(struct spu_state *csa, struct spu *spu)
 	POLL_WHILE_TRUE(in_be32(&prob->spu_status_R) & SPU_STATUS_RUNNING);
 
 	local_irq_save(flags);
-	spu_int_stat_clear(spu, 0, ~(0ul));
-	spu_int_stat_clear(spu, 2, ~(0ul));
+	spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+	spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
 	local_irq_restore(flags);
 }
 
@@ -1424,9 +1402,9 @@ static inline void clear_interrupts(struct spu_state *csa, struct spu *spu)
 	spu_int_mask_set(spu, 0, 0ul);
 	spu_int_mask_set(spu, 1, 0ul);
 	spu_int_mask_set(spu, 2, 0ul);
-	spu_int_stat_clear(spu, 0, ~0ul);
-	spu_int_stat_clear(spu, 1, ~0ul);
-	spu_int_stat_clear(spu, 2, ~0ul);
+	spu_int_stat_clear(spu, 0, CLASS0_INTR_MASK);
+	spu_int_stat_clear(spu, 1, CLASS1_INTR_MASK);
+	spu_int_stat_clear(spu, 2, CLASS2_INTR_MASK);
 	spin_unlock_irq(&spu->register_lock);
 }
 
@@ -1866,7 +1844,8 @@ static void save_lscsa(struct spu_state *prev, struct spu *spu)
 	 */
 
 	resume_mfc_queue(prev, spu);	/* Step 46. */
-	setup_mfc_slbs(prev, spu);	/* Step 47. */
+	/* Step 47. */
+	setup_mfc_slbs(prev, spu, spu_save_code, sizeof(spu_save_code));
 	set_switch_active(prev, spu);	/* Step 48. */
 	enable_interrupts(prev, spu);	/* Step 49. */
 	save_ls_16kb(prev, spu);	/* Step 50. */
@@ -1971,7 +1950,8 @@ static void restore_lscsa(struct spu_state *next, struct spu *spu)
 	setup_spu_status_part1(next, spu);	/* Step 27. */
 	setup_spu_status_part2(next, spu);	/* Step 28. */
 	restore_mfc_rag(next, spu);	        /* Step 29. */
-	setup_mfc_slbs(next, spu);	        /* Step 30. */
+	/* Step 30. */
+	setup_mfc_slbs(next, spu, spu_restore_code, sizeof(spu_restore_code));
 	set_spu_npc(next, spu);	                /* Step 31. */
 	set_signot1(next, spu);	                /* Step 32. */
 	set_signot2(next, spu);	                /* Step 33. */
@@ -2103,10 +2083,6 @@ int spu_save(struct spu_state *prev, struct spu *spu)
 	int rc;
 
 	acquire_spu_lock(spu);	        /* Step 1.     */
-	prev->dar = spu->dar;
-	prev->dsisr = spu->dsisr;
-	spu->dar = 0;
-	spu->dsisr = 0;
 	rc = __do_spu_save(prev, spu);	/* Steps 2-53. */
 	release_spu_lock(spu);
 	if (rc != 0 && rc != 2 && rc != 6) {
@@ -2133,9 +2109,6 @@ int spu_restore(struct spu_state *new, struct spu *spu)
 	acquire_spu_lock(spu);
 	harvest(NULL, spu);
 	spu->slb_replace = 0;
-	new->dar = 0;
-	new->dsisr = 0;
-	spu->class_0_pending_value = 0;
 	rc = __do_spu_restore(new, spu);
 	release_spu_lock(spu);
 	if (rc) {
@@ -2215,10 +2188,8 @@ int spu_init_csa(struct spu_state *csa)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(spu_init_csa);
 
 void spu_fini_csa(struct spu_state *csa)
 {
 	spu_free_lscsa(csa);
 }
-EXPORT_SYMBOL_GPL(spu_fini_csa);
diff --git a/include/asm-powerpc/spu.h b/include/asm-powerpc/spu.h
index ddd23db..2b76090 100644
--- a/include/asm-powerpc/spu.h
+++ b/include/asm-powerpc/spu.h
@@ -103,6 +103,7 @@
 
 struct spu_context;
 struct spu_runqueue;
+struct spu_lscsa;
 struct device_node;
 
 enum spu_utilization_state {
@@ -158,8 +159,6 @@ struct spu {
 	enum { SPU_FREE, SPU_USED } alloc_state;
 	u64 class_0_pending_value;
 
-	void (* dma_callback)(struct spu *spu, int type);
-
 	pid_t tgid;
 	int has_mem_affinity;
 	struct list_head aff_list;
@@ -205,10 +204,11 @@ struct cbe_spu_info {
 extern struct cbe_spu_info cbe_spu_info[];
 
 void spu_init_channels(struct spu *spu);
-int spu_irq_class_0_bottom(struct spu *spu);
-int spu_irq_class_1_bottom(struct spu *spu);
 void spu_irq_setaffinity(struct spu *spu, int cpu);
 
+void spu_setup_kernel_slbs(struct spu *spu, struct spu_lscsa *lscsa,
+		void *code, int code_size);
+
 #ifdef CONFIG_KEXEC
 void crash_register_spus(struct list_head *list);
 #else
@@ -227,6 +235,7 @@ static inline void crash_register_spus(struct list_head *list)
 
 extern void spu_invalidate_slbs(struct spu *spu);
 extern void spu_associate_mm(struct spu *spu, struct mm_struct *mm);
+int spu_64k_pages_available(void);
 
 /* Calls from the memory management to the SPU */
 struct mm_struct;
@@ -296,6 +305,8 @@ void spu_remove_sysdev_attr(struct sysdev_attribute *attr);
 int spu_add_sysdev_attr_group(struct attribute_group *attrs);
 void spu_remove_sysdev_attr_group(struct attribute_group *attrs);
 
+int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
+		unsigned long dsisr, unsigned *flt);
 
 /*
  * Notifier blocks:
@@ -320,7 +331,7 @@ extern void notify_spus_active(void);
 extern void do_notify_spus_active(void);
 
 /*
- * This defines the Local Store, Problem Area and Privlege Area of an SPU.
+ * This defines the Local Store, Problem Area and Privilege Area of an SPU.
  */
 
 union mfc_tag_size_class_cmd {
@@ -541,8 +552,24 @@ struct spu_priv1 {
 #define CLASS2_ENABLE_SPU_STOP_INTR			0x2L
 #define CLASS2_ENABLE_SPU_HALT_INTR			0x4L
 #define CLASS2_ENABLE_SPU_DMA_TAG_GROUP_COMPLETE_INTR	0x8L
+#define CLASS2_ENABLE_MAILBOX_THRESHOLD_INTR		0x10L
 	u8  pad_0x118_0x140[0x28];				/* 0x118 */
 	u64 int_stat_RW[3];					/* 0x140 */
+#define CLASS0_DMA_ALIGNMENT_INTR			0x1L
+#define CLASS0_INVALID_DMA_COMMAND_INTR			0x2L
+#define CLASS0_SPU_ERROR_INTR				0x4L
+#define CLASS0_INTR_MASK				0x7L
+#define CLASS1_SEGMENT_FAULT_INTR			0x1L
+#define CLASS1_STORAGE_FAULT_INTR			0x2L
+#define CLASS1_LS_COMPARE_SUSPEND_ON_GET_INTR		0x4L
+#define CLASS1_LS_COMPARE_SUSPEND_ON_PUT_INTR		0x8L
+#define CLASS1_INTR_MASK				0xfL
+#define CLASS2_MAILBOX_INTR				0x1L
+#define CLASS2_SPU_STOP_INTR				0x2L
+#define CLASS2_SPU_HALT_INTR				0x4L
+#define CLASS2_SPU_DMA_TAG_GROUP_COMPLETE_INTR		0x8L
+#define CLASS2_MAILBOX_THRESHOLD_INTR			0x10L
+#define CLASS2_INTR_MASK				0x1fL
 	u8  pad_0x158_0x180[0x28];				/* 0x158 */
 	u64 int_route_RW;					/* 0x180 */
 
diff --git a/include/asm-powerpc/spu_csa.h b/include/asm-powerpc/spu_csa.h
index 01a3b93..0ab6bff 100644
--- a/include/asm-powerpc/spu_csa.h
+++ b/include/asm-powerpc/spu_csa.h
@@ -194,7 +194,7 @@ struct spu_priv1_collapsed {
 };
 
 /*
- * struct spu_priv2_collapsed - condensed priviliged 2 area, w/o pads.
+ * struct spu_priv2_collapsed - condensed privileged 2 area, w/o pads.
  */
 struct spu_priv2_collapsed {
 	u64 slb_index_W;
@@ -241,6 +241,12 @@ struct spu_priv2_collapsed {
  */
 struct spu_state {
 	struct spu_lscsa *lscsa;
+#ifdef CONFIG_SPU_FS_64K_LS
+	int		use_big_pages;
+	/* One struct page per 64k page */
+#define SPU_LSCSA_NUM_BIG_PAGES	(sizeof(struct spu_lscsa) / 0x10000)
+	struct page	*lscsa_pages[SPU_LSCSA_NUM_BIG_PAGES];
+#endif
 	struct spu_problem_collapsed prob;
 	struct spu_priv1_collapsed priv1;
 	struct spu_priv2_collapsed priv2;
@@ -248,20 +254,11 @@ struct spu_state {
 	u64 spu_chnldata_RW[32];
 	u32 spu_mailbox_data[4];
 	u32 pu_mailbox_data[1];
-	u64 dar, dsisr;
+	u64 dar, dsisr, class_0_pending;
 	unsigned long suspend_time;
 	spinlock_t register_lock;
 };
 
-extern int spu_init_csa(struct spu_state *csa);
-extern void spu_fini_csa(struct spu_state *csa);
-extern int spu_save(struct spu_state *prev, struct spu *spu);
-extern int spu_restore(struct spu_state *new, struct spu *spu);
-extern int spu_switch(struct spu_state *prev, struct spu_state *new,
-		      struct spu *spu);
-extern int spu_alloc_lscsa(struct spu_state *csa);
-extern void spu_free_lscsa(struct spu_state *csa);
-
 #endif /* !__SPU__ */
 #endif /* __KERNEL__ */
 #endif /* !__ASSEMBLY__ */
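With CONFIG_SPU_FS_64K_LS the context save area now records its own backing pages, and SPU_LSCSA_NUM_BIG_PAGES is simply sizeof(struct spu_lscsa) divided by the 64k page size. A hedged sketch of reading the new fields (hypothetical helper, not part of the patch):

#include <asm/page.h>		/* PAGE_SIZE, PAGE_SHIFT */
#include <asm/spu_csa.h>

/* Sketch only: how many pages back one local-store context save area. */
static inline unsigned int example_lscsa_page_count(struct spu_state *csa)
{
#ifdef CONFIG_SPU_FS_64K_LS
	if (csa->use_big_pages)
		return SPU_LSCSA_NUM_BIG_PAGES;		/* one 64k page per entry */
#endif
	/* otherwise the usual round-up to the base page size */
	return (sizeof(struct spu_lscsa) + PAGE_SIZE - 1) >> PAGE_SHIFT;
}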
diff --git a/include/asm-powerpc/spu_priv1.h b/include/asm-powerpc/spu_priv1.h
index 5663789..f644751 100644
--- a/include/asm-powerpc/spu_priv1.h
+++ b/include/asm-powerpc/spu_priv1.h
@@ -24,6 +24,7 @@
 #include <linux/types.h>
 
 struct spu;
+struct spu_context;
 
 /* access to priv1 registers */
 
@@ -179,6 +180,9 @@ struct spu_management_ops {
 	int (*create_spu)(struct spu *spu, void *data);
 	int (*destroy_spu)(struct spu *spu);
 	int (*init_affinity)(void);
+	/* Additions for RHEL5U3 */
+	void (*enable_spu)(struct spu_context *ctx);
+	void (*disable_spu)(struct spu_context *ctx);
 };
 
 extern const struct spu_management_ops* spu_management_ops;
@@ -207,6 +211,18 @@ spu_init_affinity (void)
 	return spu_management_ops->init_affinity();
 }
 
+static inline void
+spu_enable_spu (struct spu_context *ctx)
+{
+	spu_management_ops->enable_spu(ctx);
+}
+
+static inline void
+spu_disable_spu (struct spu_context *ctx)
+{
+	spu_management_ops->disable_spu(ctx);
+}
+
 /*
  * The declarations following are put here for convenience
  * and only intended to be used by the platform setup code.
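The new enable_spu/disable_spu hooks are filled in per platform alongside the existing callbacks. A purely hypothetical sketch of a platform ops table (names and bodies illustrative only; the real implementations belong to the platform setup code referred to above):

#include <asm/spu_priv1.h>

static void example_enable_spu(struct spu_context *ctx)
{
	/* e.g. bring the SPE backing this context out of a quiesced state */
}

static void example_disable_spu(struct spu_context *ctx)
{
	/* e.g. quiesce the SPE once the context is no longer scheduled */
}

static const struct spu_management_ops example_spu_management_ops = {
	/* existing callbacks (enumerate_spus, create_spu, ...) omitted */
	.enable_spu	= example_enable_spu,
	.disable_spu	= example_disable_spu,
};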
diff --git a/include/linux/list.h b/include/linux/list.h
index f96bd38..244e1d6 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -533,6 +533,20 @@ static inline void list_splice_init_rcu(struct list_head *list,
 	     pos = list_entry(pos->member.next, typeof(*pos), member))
 
 /**
+ * list_for_each_entry_continue_reverse - iterate backwards from the given point
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Start to iterate over list of given type backwards, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue_reverse(pos, head, member)		\
+	for (pos = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     prefetch(pos->member.prev), &pos->member != (head);	\
+	     pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
  * list_for_each_entry_from - iterate over list of given type from the current point
  * @pos:	the type * to use as a loop cursor.
  * @head:	the head for your list.