|
|
|
/*
|
|
|
|
* Copyright 2010 Tilera Corporation. All Rights Reserved.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation, version 2.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful, but
|
|
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
|
|
|
|
* NON INFRINGEMENT. See the GNU General Public License for
|
|
|
|
* more details.
|
|
|
|
*
|
|
|
|
* Support the cycle counter clocksource and tile timer clock event device.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/time.h>
|
|
|
|
#include <linux/timex.h>
|
|
|
|
#include <linux/clocksource.h>
|
|
|
|
#include <linux/clockchips.h>
|
|
|
|
#include <linux/hardirq.h>
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/sched/clock.h>
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/delay.h>
|
arch/tile: various header improvements for building drivers
This change adds a number of missing headers in asm (fb.h, parport.h,
serial.h, and vga.h) using the minimal generic versions.
It also adds a number of missing interfaces that showed up as build
failures when trying to build various drivers not normally included in the
"tile" distribution: ioremap_wc(), memset_io(), io{read,write}{16,32}be(),
virt_to_bus(), bus_to_virt(), irq_canonicalize(), __pte(), __pgd(),
and __pmd(). I also added a cast in virt_to_page() since not all callers
pass a pointer.
I fixed <asm/stat.h> to properly include a __KERNEL__ guard for the
__ARCH_WANT_STAT64 symbol, and <asm/swab.h> to use __builtin_bswap32()
even for our 64-bit architecture, since the same code is produced.
I added an export for get_cycles(), since it's used in some modules.
And I made <arch/spr_def.h> properly include the __KERNEL__ guard,
even though it's not yet exported, since it likely will be soon.
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
14 years ago
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/timekeeper_internal.h>
|
|
|
|
#include <asm/irq_regs.h>
|
|
|
|
#include <asm/traps.h>
|
|
|
|
#include <asm/vdso.h>
|
|
|
|
#include <hv/hypervisor.h>
|
|
|
|
#include <arch/interrupts.h>
|
|
|
|
#include <arch/spr_def.h>
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Define the cycle counter clock source.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* How many cycles per second we are running at. */
|
|
|
|
static cycles_t cycles_per_sec __ro_after_init;
|
|
|
|
|
|
|
|
cycles_t get_clock_rate(void)
|
|
|
|
{
|
|
|
|
return cycles_per_sec;
|
|
|
|
}
|
|
|
|
|
|
|
|
#if CHIP_HAS_SPLIT_CYCLE()
|
|
|
|
cycles_t get_cycles(void)
|
|
|
|
{
|
|
|
|
unsigned int high = __insn_mfspr(SPR_CYCLE_HIGH);
|
|
|
|
unsigned int low = __insn_mfspr(SPR_CYCLE_LOW);
|
|
|
|
unsigned int high2 = __insn_mfspr(SPR_CYCLE_HIGH);
|
|
|
|
|
|
|
|
while (unlikely(high != high2)) {
|
|
|
|
low = __insn_mfspr(SPR_CYCLE_LOW);
|
|
|
|
high = high2;
|
|
|
|
high2 = __insn_mfspr(SPR_CYCLE_HIGH);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (((cycles_t)high) << 32) | low;
|
|
|
|
}
|
arch/tile: various header improvements for building drivers
This change adds a number of missing headers in asm (fb.h, parport.h,
serial.h, and vga.h) using the minimal generic versions.
It also adds a number of missing interfaces that showed up as build
failures when trying to build various drivers not normally included in the
"tile" distribution: ioremap_wc(), memset_io(), io{read,write}{16,32}be(),
virt_to_bus(), bus_to_virt(), irq_canonicalize(), __pte(), __pgd(),
and __pmd(). I also added a cast in virt_to_page() since not all callers
pass a pointer.
I fixed <asm/stat.h> to properly include a __KERNEL__ guard for the
__ARCH_WANT_STAT64 symbol, and <asm/swab.h> to use __builtin_bswap32()
even for our 64-bit architecture, since the same code is produced.
I added an export for get_cycles(), since it's used in some modules.
And I made <arch/spr_def.h> properly include the __KERNEL__ guard,
even though it's not yet exported, since it likely will be soon.
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
14 years ago
|
|
|
EXPORT_SYMBOL(get_cycles);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We use a relatively small shift value so that sched_clock()
|
|
|
|
* won't wrap around very often.
|
|
|
|
*/
|
|
|
|
#define SCHED_CLOCK_SHIFT 10
|
|
|
|
|
|
|
|
static unsigned long sched_clock_mult __ro_after_init;
|
|
|
|
|
|
|
|
/*
 * Clocksource read callback: returns the current get_cycles() value.
 * The @cs argument is not used.
 */
static cycles_t clocksource_get_cycles(struct clocksource *cs)
{
	return get_cycles();
}
|
|
|
|
|
|
|
|
static struct clocksource cycle_counter_cs = {
|
|
|
|
.name = "cycle counter",
|
|
|
|
.rating = 300,
|
|
|
|
.read = clocksource_get_cycles,
|
|
|
|
.mask = CLOCKSOURCE_MASK(64),
|
|
|
|
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called very early from setup_arch() to set cycles_per_sec.
|
|
|
|
* We initialize it early so we can use it to set up loops_per_jiffy.
|
|
|
|
*/
|
|
|
|
void __init setup_clock(void)
|
|
|
|
{
|
|
|
|
cycles_per_sec = hv_sysconf(HV_SYSCONF_CPU_SPEED);
|
|
|
|
sched_clock_mult =
|
|
|
|
clocksource_hz2mult(cycles_per_sec, SCHED_CLOCK_SHIFT);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Set loops_per_jiffy directly from the known clock rate and log the
 * resulting BogoMIPS figure (integer part and two fractional digits).
 */
void __init calibrate_delay(void)
{
	unsigned long bogo_whole, bogo_hundredths;

	loops_per_jiffy = get_clock_rate() / HZ;

	bogo_whole = loops_per_jiffy / (500000 / HZ);
	bogo_hundredths = (loops_per_jiffy / (5000 / HZ)) % 100;

	pr_info("Clock rate yields %lu.%02lu BogoMIPS (lpj=%lu)\n",
		bogo_whole, bogo_hundredths, loops_per_jiffy);
}
|
|
|
|
|
|
|
|
/* Called fairly late in init/main.c, but before we go smp. */
|
|
|
|
void __init time_init(void)
|
|
|
|
{
|
|
|
|
/* Initialize and register the clock source. */
|
|
|
|
clocksource_register_hz(&cycle_counter_cs, cycles_per_sec);
|
|
|
|
|
|
|
|
/* Start up the tile-timer interrupt source on the boot cpu. */
|
|
|
|
setup_tile_timer();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Tile timer clock event device.
 *
 * The timer is driven by the TILE_TIMER_CONTROL register: a 31-bit
 * down counter plus bit 31, which signifies that the counter has
 * wrapped from zero to (2**31) - 1.  The INT_TILE_TIMER interrupt is
 * raised for as long as bit 31 is set.
 *
 * TILE_MINSEC represents the largest range of real time the timer can
 * possibly cover, based on MAX_TICK combined with the slowest
 * reasonable clock rate we might run at.
 */

#define MAX_TICK 0x7fffffff   /* we have 31 bits of countdown timer */
#define TILE_MINSEC 5         /* timer covers no more than 5 seconds */
|
|
|
|
|
|
|
|
static int tile_timer_set_next_event(unsigned long ticks,
|
|
|
|
struct clock_event_device *evt)
|
|
|
|
{
|
|
|
|
BUG_ON(ticks > MAX_TICK);
|
|
|
|
__insn_mtspr(SPR_TILE_TIMER_CONTROL, ticks);
|
|
|
|
arch_local_irq_unmask_now(INT_TILE_TIMER);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Whenever anyone tries to change modes, we just mask interrupts
|
|
|
|
* and wait for the next event to get set.
|
|
|
|
*/
|
|
|
|
static int tile_timer_shutdown(struct clock_event_device *evt)
|
|
|
|
{
|
|
|
|
arch_local_irq_mask_now(INT_TILE_TIMER);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set min_delta_ns to 1 microsecond, since it takes about
|
|
|
|
* that long to fire the interrupt.
|
|
|
|
*/
|
|
|
|
static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = {
|
|
|
|
.name = "tile timer",
|
|
|
|
.features = CLOCK_EVT_FEAT_ONESHOT,
|
|
|
|
.min_delta_ns = 1000,
|
|
|
|
.min_delta_ticks = 1,
|
|
|
|
.max_delta_ticks = MAX_TICK,
|
|
|
|
.rating = 100,
|
|
|
|
.irq = -1,
|
|
|
|
.set_next_event = tile_timer_set_next_event,
|
|
|
|
.set_state_shutdown = tile_timer_shutdown,
|
|
|
|
.set_state_oneshot = tile_timer_shutdown,
|
|
|
|
.set_state_oneshot_stopped = tile_timer_shutdown,
|
|
|
|
.tick_resume = tile_timer_shutdown,
|
|
|
|
};
|
|
|
|
|
|
|
|
void setup_tile_timer(void)
|
|
|
|
{
|
tile: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
At the end of the patch set all uses of __get_cpu_var have been removed so
the macro is removed too.
The patch set includes passes over all arches as well. Once these operations
are used throughout then specialized macros can be defined in non -x86
arches as well in order to optimize per cpu access by f.e. using a global
register that may be set to the per cpu base.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Acked-by: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
11 years ago
|
|
|
struct clock_event_device *evt = this_cpu_ptr(&tile_timer);
|
|
|
|
|
|
|
|
/* Fill in fields that are speed-specific. */
|
|
|
|
clockevents_calc_mult_shift(evt, cycles_per_sec, TILE_MINSEC);
|
|
|
|
evt->max_delta_ns = clockevent_delta2ns(MAX_TICK, evt);
|
|
|
|
|
|
|
|
/* Mark as being for this cpu only. */
|
|
|
|
evt->cpumask = cpumask_of(smp_processor_id());
|
|
|
|
|
|
|
|
/* Start out with timer not firing. */
|
|
|
|
arch_local_irq_mask_now(INT_TILE_TIMER);
|
|
|
|
|
|
|
|
/* Register tile timer. */
|
|
|
|
clockevents_register_device(evt);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Called from the interrupt vector. */
|
|
|
|
void do_timer_interrupt(struct pt_regs *regs, int fault_num)
|
|
|
|
{
|
|
|
|
struct pt_regs *old_regs = set_irq_regs(regs);
|
tile: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
At the end of the patch set all uses of __get_cpu_var have been removed so
the macro is removed too.
The patch set includes passes over all arches as well. Once these operations
are used throughout then specialized macros can be defined in non -x86
arches as well in order to optimize per cpu access by f.e. using a global
register that may be set to the per cpu base.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Acked-by: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
11 years ago
|
|
|
struct clock_event_device *evt = this_cpu_ptr(&tile_timer);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mask the timer interrupt here, since we are a oneshot timer
|
|
|
|
* and there are now by definition no events pending.
|
|
|
|
*/
|
|
|
|
arch_local_irq_mask(INT_TILE_TIMER);
|
|
|
|
|
|
|
|
/* Track time spent here in an interrupt context */
|
|
|
|
irq_enter();
|
|
|
|
|
|
|
|
/* Track interrupt count. */
|
tile: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
At the end of the patch set all uses of __get_cpu_var have been removed so
the macro is removed too.
The patch set includes passes over all arches as well. Once these operations
are used throughout then specialized macros can be defined in non -x86
arches as well in order to optimize per cpu access by f.e. using a global
register that may be set to the per cpu base.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Acked-by: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
11 years ago
|
|
|
__this_cpu_inc(irq_stat.irq_timer_count);
|
|
|
|
|
|
|
|
/* Call the generic timer handler */
|
|
|
|
evt->event_handler(evt);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Track time spent against the current process again and
|
|
|
|
* process any softirqs if they are waiting.
|
|
|
|
*/
|
|
|
|
irq_exit();
|
|
|
|
|
|
|
|
set_irq_regs(old_regs);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Scheduler clock - returns current time in nanosec units.
|
|
|
|
* Note that with LOCKDEP, this is called during lockdep_init(), and
|
|
|
|
* we will claim that sched_clock() is zero for a little while, until
|
|
|
|
* we run setup_clock(), above.
|
|
|
|
*/
|
|
|
|
unsigned long long sched_clock(void)
|
|
|
|
{
|
tile: avoid using clocksource_cyc2ns with absolute cycle count
For large values of "mult" and long uptimes, the intermediate
result of "cycles * mult" can overflow 64 bits. For example,
the tile platform calls clocksource_cyc2ns with a 1.2 GHz clock;
we have mult = 853, and after 208.5 days, we overflow 64 bits.
Since clocksource_cyc2ns() is intended to be used for relative
cycle counts, not absolute cycle counts, performance is more
importance than accepting a wider range of cycle values. So,
just use mult_frac() directly in tile's sched_clock().
Commit 4cecf6d401a0 ("sched, x86: Avoid unnecessary overflow
in sched_clock") by Salman Qazi results in essentially the same
generated code for x86 as this change does for tile. In fact,
a follow-on change by Salman introduced mult_frac() and switched
to using it, so the C code was largely identical at that point too.
Peter Zijlstra then added mul_u64_u32_shr() and switched x86
to use it. This is, in principle, better; by optimizing the
64x64->64 multiplies to be 32x32->64 multiplies we can potentially
save some time. However, the compiler piplines the 64x64->64
multiplies pretty well, and the conditional branch in the generic
mul_u64_u32_shr() causes some bubbles in execution, with the
result that it's pretty much a wash. If tilegx provided its own
implementation of mul_u64_u32_shr() without the conditional branch,
we could potentially save 3 cycles, but that seems like small gain
for a fair amount of additional build scaffolding; no other platform
currently provides a mul_u64_u32_shr() override, and tile doesn't
currently have an <asm/div64.h> header to put the override in.
Additionally, gcc currently has an optimization bug that prevents
it from recognizing the opportunity to use a 32x32->64 multiply,
and so the result would be no better than the existing mult_frac()
until such time as the compiler is fixed.
For now, just using mult_frac() seems like the right answer.
Cc: stable@kernel.org [v3.4+]
Signed-off-by: Chris Metcalf <cmetcalf@mellanox.com>
8 years ago
|
|
|
return mult_frac(get_cycles(),
|
|
|
|
sched_clock_mult, 1ULL << SCHED_CLOCK_SHIFT);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Changing the profiling timer multiplier is not supported: every
 * request is rejected with -EINVAL, whatever @multiplier is.
 */
int setup_profiling_timer(unsigned int multiplier)
{
	return -EINVAL;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use the tile timer to convert nsecs to core clock cycles, relying
|
|
|
|
* on it having the same frequency as SPR_CYCLE.
|
|
|
|
*/
|
|
|
|
cycles_t ns2cycles(unsigned long nsecs)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We do not have to disable preemption here as each core has the same
|
|
|
|
* clock frequency.
|
|
|
|
*/
|
tile: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
At the end of the patch set all uses of __get_cpu_var have been removed so
the macro is removed too.
The patch set includes passes over all arches as well. Once these operations
are used throughout then specialized macros can be defined in non -x86
arches as well in order to optimize per cpu access by f.e. using a global
register that may be set to the per cpu base.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Acked-by: Chris Metcalf <cmetcalf@tilera.com>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
11 years ago
|
|
|
struct clock_event_device *dev = raw_cpu_ptr(&tile_timer);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* as in clocksource.h and x86's timer.h, we split the calculation
|
|
|
|
* into 2 parts to avoid unecessary overflow of the intermediate
|
|
|
|
* value. This will not lead to any loss of precision.
|
|
|
|
*/
|
|
|
|
u64 quot = (u64)nsecs >> dev->shift;
|
|
|
|
u64 rem = (u64)nsecs & ((1ULL << dev->shift) - 1);
|
|
|
|
return quot * dev->mult + ((rem * dev->mult) >> dev->shift);
|
|
|
|
}
|
|
|
|
|
|
|
|
void update_vsyscall_tz(void)
|
|
|
|
{
|
|
|
|
write_seqcount_begin(&vdso_data->tz_seq);
|
|
|
|
vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
|
|
|
|
vdso_data->tz_dsttime = sys_tz.tz_dsttime;
|
|
|
|
write_seqcount_end(&vdso_data->tz_seq);
|
|
|
|
}
|
|
|
|
|
|
|
|
void update_vsyscall(struct timekeeper *tk)
|
|
|
|
{
|
|
|
|
if (tk->tkr_mono.clock != &cycle_counter_cs)
|
|
|
|
return;
|
|
|
|
|
|
|
|
write_seqcount_begin(&vdso_data->tb_seq);
|
|
|
|
|
|
|
|
vdso_data->cycle_last = tk->tkr_mono.cycle_last;
|
|
|
|
vdso_data->mask = tk->tkr_mono.mask;
|
|
|
|
vdso_data->mult = tk->tkr_mono.mult;
|
|
|
|
vdso_data->shift = tk->tkr_mono.shift;
|
|
|
|
|
|
|
|
vdso_data->wall_time_sec = tk->xtime_sec;
|
|
|
|
vdso_data->wall_time_snsec = tk->tkr_mono.xtime_nsec;
|
|
|
|
|
|
|
|
vdso_data->monotonic_time_sec = tk->xtime_sec
|
|
|
|
+ tk->wall_to_monotonic.tv_sec;
|
|
|
|
vdso_data->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
|
|
|
|
+ ((u64)tk->wall_to_monotonic.tv_nsec
|
|
|
|
<< tk->tkr_mono.shift);
|
|
|
|
while (vdso_data->monotonic_time_snsec >=
|
|
|
|
(((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
|
|
|
|
vdso_data->monotonic_time_snsec -=
|
|
|
|
((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
|
|
|
|
vdso_data->monotonic_time_sec++;
|
|
|
|
}
|
|
|
|
|
|
|
|
vdso_data->wall_time_coarse_sec = tk->xtime_sec;
|
|
|
|
vdso_data->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
|
|
|
|
tk->tkr_mono.shift);
|
|
|
|
|
|
|
|
vdso_data->monotonic_time_coarse_sec =
|
|
|
|
vdso_data->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
|
|
|
|
vdso_data->monotonic_time_coarse_nsec =
|
|
|
|
vdso_data->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
|
|
|
|
|
|
|
|
while (vdso_data->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
|
|
|
|
vdso_data->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
|
|
|
|
vdso_data->monotonic_time_coarse_sec++;
|
|
|
|
}
|
|
|
|
|
|
|
|
write_seqcount_end(&vdso_data->tb_seq);
|
|
|
|
}
|