Path: utzoo!utgpu!jarvis.csri.toronto.edu!rutgers!tut.cis.ohio-state.edu!G.OSWEGO.EDU!dl
From: dl@G.OSWEGO.EDU (Doug Lea)
Newsgroups: gnu.g++.lib.bug
Subject: Re: lack of inline functions in libg++
Message-ID: <8910261030.AA01450@g.oswego.edu>
Date: 26 Oct 89 10:30:30 GMT
Sender: daemon@tut.cis.ohio-state.edu
Reply-To: dl@oswego.oswego.edu
Distribution: gnu
Organization: GNUs Not Usenet
Lines: 126


Michael said...

> Another solution to coding around the cost of function calls is to use
> a RISC workstation where that cost is negligible

But this is only part of the story. The *primary* goal of inlining on
a RISC machine like a SPARC is not really saving a cycle or two for the
function call, but rather `procedural integration'. If inlines reveal
to the compiler some important constraints, constants, etc., that make
further simplifications and optimizations possible, it can be a HUGE
win. Try this:


#include <builtin.h>
#include <stream.h>

// Fixed-size vector of one million floats.  It exposes the same two
// operations (element count and element access) three ways -- inline,
// ordinary out-of-line, and virtual out-of-line -- so that the cost of
// each calling style can be compared.
class Vec1000000
{
  enum { length = 1000000 };    // number of elements held

  float    elems[length];

public:
           Vec1000000() {}
          ~Vec1000000() {}

  // inline versions, defined right here in the class body
  int      size()        { return length; }
  float&   sub(int i)    { return elems[i]; }

  // non-inline versions, defined after the class
  int      ni_size();
  float&   ni_sub(int i);

  // virtual non-inline versions, defined after the class
  virtual int      v_size();
  virtual float&   v_sub(int i);
};

// Out-of-line element count: plain and virtual.
int Vec1000000::ni_size() { return length; }
int Vec1000000::v_size()  { return length; }

// Out-of-line element access (no bounds check): plain and virtual.
float& Vec1000000::ni_sub(int i) { return elems[i]; }
float& Vec1000000::v_sub(int i)  { return elems[i]; }


// Store x into every element of v, going through the inline accessors
// size() and sub().  size() is consulted before every iteration.
void fill(Vec1000000& v, float x)
{
  int k = 0;
  while (k < v.size())
  {
    v.sub(k) = x;
    ++k;
  }
}

// Store x into every element of v, going through the non-inline accessors
// ni_size() and ni_sub().  ni_size() is called before every iteration.
void ni_fill(Vec1000000& v, float x)
{
  int k = 0;
  while (k < v.ni_size())
  {
    v.ni_sub(k) = x;
    ++k;
  }
}

// Store x into every element of v, going through the virtual accessors
// v_size() and v_sub().  v_size() is called before every iteration.
void v_fill(Vec1000000& v, float x)
{
  int k = 0;
  while (k < v.v_size())
  {
    v.v_sub(k) = x;
    ++k;
  }
}

// Benchmark driver: fills the same vector three times -- through the
// inline, the non-inline, and the virtual accessors -- and prints the
// time reported for each pass.
//
// start_timer() and return_elapsed_time() are not defined in this file;
// presumably they are supplied by <builtin.h> -- confirm against that header.
//
// The return type is spelled out because standard C++ has no implicit
// int, and 0 is returned explicitly to report success.
int main()
{
  // Automatic object holding 1000000 floats, so the program needs a
  // correspondingly large stack.
  Vec1000000 v;

  start_timer();  fill(v, 1.0);
  cout << "fill time = " << return_elapsed_time(0.0) << "\n";
  start_timer();  ni_fill(v, 1.0);
  cout << "ni_fill time = " << return_elapsed_time(0.0) << "\n";
  start_timer();  v_fill(v, 1.0);
  cout << "v_fill time = " << return_elapsed_time(0.0) << "\n";

  return 0;
}


g++ -g -O -fstrength-reduce -fdelayed-branch t1026.cc -lg++

Compilation finished at Thu Oct 26 05:51:39

(on a Sun4/110)

g.oswego.edu% a.out
fill time = 0.71
ni_fill time = 1.93
v_fill time = 4.41
g.oswego.edu% !!
a.out
fill time = 0.63
ni_fill time = 1.9
v_fill time = 3.58
g.oswego.edu% !!
a.out
fill time = 0.58
ni_fill time = 1.89
v_fill time = 4.41
g.oswego.edu% 


So inlining gave a > 2.5X speedup over noninlines, and > 5X over
virtuals on the Sparc. It's not too hard to make examples where it's
closer to 10X. 

The code generated for inline fill() is about the best you could get
anywhere:

! Compiler output for fill(Vec1000000&, float) on SPARC.
! No calls to size() or sub() remain: the loop is a pointer walk from the
! address passed as v up to that address plus 1000000*4 bytes, storing x
! at each 4-byte step.
_fill__FR10Vec1000000f:
	!#PROLOGUE# 0
	save %sp,-112,%sp	! open a new register window and stack frame
	!#PROLOGUE# 1
	mov %i1,%o2	! %o2 = second argument (x)
	sethi %hi(1000000),%o0	! upper bits of the constant 1000000
	or %lo(1000000),%o0,%o0	! %o0 = 1000000, the inlined size()
	mov %i0,%o1	! %o1 = first argument (v), used as the running pointer
	sll %o0,2,%o0	! %o0 = 1000000 * 4, the byte count
	b L672	! enter the loop at its test
	add %o0,%i0,%o0	! branch delay slot: %o0 = v + byte count, the end address
L677:
	st %o2,[%o1]	! store x at the current element
	add %o1,4,%o1	! advance the running pointer by 4 bytes
L672:
	cmp %o1,%o0	! running pointer versus end address
	bl L677	! loop again while pointer < end
	nop	! branch delay slot, nothing to do
	ret	! return to the caller
	restore	! delay slot of ret: pop the register window


-Doug