Count occurrences of a substring

From Rosetta Code
Task
Count occurrences of a substring
You are encouraged to solve this task according to the task description, using any language you may know.

The task is to either create a function, or show a built-in function, to count the number of non-overlapping occurrences of a substring inside a string. The function should take two arguments: the first argument being the string to search and the second a substring to be search for. It should return an integer count.

<lang pseudocode>print countSubstring("the three truths","th") 3

// do not count substrings that overlap with previously-counted substrings: print countSubstring("ababababab","abab") 2</lang>

The matching should yield the highest number of non-overlapping matches. In general, this essentially means matching from left-to-right or right-to-left (see proof on talk page).


Ada

<lang Ada>with Ada.Strings.Fixed, Ada.Text_IO;

procedure Count_Substrings is

  function Substrings(Main: String; Sub: String) return Natural is
     Idx: Natural :=  Ada.Strings.Fixed.Index(Source => Main, Pattern => Sub);
  begin
     if Idx = 0 then
        return 0;
     else
        return 1 + Substrings(Main(Idx+Sub'Length .. Main'Last), Sub);
     end if;
  end Substrings;

begin

  Ada.Text_IO.Put(Integer'Image(Substrings("the three truths", "th")));
  Ada.Text_IO.Put(Integer'Image(Substrings("ababababab", "abab")));

end Count_Substrings; </lang>

Output:

 3 2

ALGOL 68

Works with: ALGOL 68 version Revision 1 - no extensions to language used.
Works with: ALGOL 68G version Any - tested with release 1.18.0-9h.tiny.

Algol68 has no build in function to do this task, hence the next to create a count string in string routine. <lang algol68>#!/usr/local/bin/a68g --script #

PROC count string in string = (STRING needle, haystack)INT: (

 INT start:=LWB haystack, next, out:=0;
 FOR count WHILE string in string(needle, next, haystack[start:]) DO
   start+:=next+UPB needle-LWB needle;
   out:=count
 OD;
 out

);

printf(($d" "$,

 count string in string("th", "the three truths"),    # expect 3 #
 count string in string("abab", "ababababab"),        # expect 2 #
 count string in string("a*b", "abaabba*bbaba*bbab"), # expect 2 #
 $l$

))</lang> Output:

3 2 2 

AutoHotkey

While it is simple enough to parse the string, AutoHotkey has a rather unconventional method which outperforms this. StringReplace sets the number of replaced strings to ErrorLevel. <lang AutoHotkey>MsgBox % countSubstring("the three truths","th") ; 3 MsgBox % countSubstring("ababababab","abab")  ; 2

CountSubstring(fullstring, substring){

  StringReplace, junk, fullstring, %substring%, , UseErrorLevel
  return errorlevel

}</lang>

AWK

<lang AWK>#!/usr/local/bin/awk -f

 function countsubstring (str,pat) 
 {
   n=0;	
   while (match(str,pat)) {
       n++;

str = substr(str,RSTART+RLENGTH);

   }
   return n;
 }
 BEGIN {
   print countsubstring("the three truths","th");
   print countsubstring("ababababab","abab");
   print countsubstring(ARGV[1],ARGV[2]);
 }</lang>

Output:

$ ./countsubstring.awk aaaabacad aa
3
2
2

BASIC

Works with: QBasic

In FreeBASIC, this needs to be compiled with -lang qb or -lang fblite.

<lang qbasic>DECLARE FUNCTION countSubstring& (where AS STRING, what AS STRING)

PRINT "the three truths, th:", countSubstring&("the three truths", "th") PRINT "ababababab, abab:", countSubstring&("ababababab", "abab")

FUNCTION countSubstring& (where AS STRING, what AS STRING)

   DIM c AS LONG, s AS LONG
   s = 1 - LEN(what)
   DO
       s = INSTR(s + LEN(what), where, what)
       IF 0 = s THEN EXIT DO
       c = c + 1
   LOOP
   countSubstring = c

END FUNCTION</lang>

Output:

the three truths, th:        3
ababababab, abab:            2

See also: Liberty BASIC, PowerBASIC, PureBasic.

BBC BASIC

<lang bbcbasic> tst$ = "the three truths"

     sub$ = "th"
     PRINT ; FNcountSubstring(tst$, sub$) " """ sub$ """ in """ tst$ """"
     tst$ = "ababababab"
     sub$ = "abab"
     PRINT ; FNcountSubstring(tst$, sub$) " """ sub$ """ in """ tst$ """"
     END
     
     DEF FNcountSubstring(A$, B$)
     LOCAL I%, N%
     I% = 1 : N% = 0
     REPEAT
       I% = INSTR(A$, B$, I%)
       IF I% THEN N% += 1 : I% += LEN(B$)
     UNTIL I% = 0
     = N%

</lang> Output:

3 "th" in "the three truths"
2 "abab" in "ababababab"

C

<lang C>#include <stdio.h>

  1. include <string.h>

int match(const char *s, const char *p, int overlap) {

       int c = 0, l = strlen(p);
       while (*s != '\0') {
               if (strncmp(s++, p, l)) continue;
               if (!overlap) s += l - 1;
               c++;
       }
       return c;

}

int main() {

       printf("%d\n", match("the three truths", "th", 0));
       printf("overlap:%d\n", match("abababababa", "aba", 1));
       printf("not:    %d\n", match("abababababa", "aba", 0));
       return 0;

}</lang>

C++

<lang cpp>#include <iostream>

  1. include <string>

// returns count of non-overlapping occurrences of 'sub' in 'str' int countSubstring(const std::string& str, const std::string& sub) {

   if (sub.length() == 0) return 0;
   int count = 0;
   for (size_t offset = 0;
       (offset < str.length()) && ((offset = str.find(sub, offset)) != std::string::npos);
       offset += sub.length())
   {
       ++count;
   }
   return count;

}

int main() {

   std::cout << countSubstring("the three truths", "th")    << '\n';
   std::cout << countSubstring("ababababab", "abab")        << '\n';
   std::cout << countSubstring("abaabba*bbaba*bbab", "a*b") << '\n';

}</lang> Output:

3
2
2

C#

<lang c sharp> using System;

class SubStringTestClass {

   public static int CountSubStrings(string testString, string testSubstring)
   (
       int location = 0;
       int count = 0;
       while (location < testString.Length)
       {
           int found = testString.IndexOf(testSubstring, location);
           if (found == -1)
           {
               break;
           }
           else
           {
               count++;
               location = location + testSubstring.Length;
           }
       }
       return count;
   }

}</lang>

CoffeeScript

<lang coffeescript> countSubstring = (str, substr) ->

 n = 0
 i = 0
 while (pos = str.indexOf(substr, i)) != -1
   n += 1
   i = pos + substr.length
 n

console.log countSubstring "the three truths", "th" console.log countSubstring "ababababab", "abab" </lang>

Common Lisp

<lang lisp>(defun count-sub (str pat)

 (loop with z = 0 with s = 0 while s do

(when (setf s (search pat str :start2 s)) (incf z) (incf s (length pat))) finally (return z)))

(count-sub "ababa" "ab")  ; 2 (count-sub "ababa" "aba") ; 1</lang>

D

<lang d>import std.stdio, std.algorithm;

void main() {

   writeln("the three truths".count("th"));
   writeln("ababababab".count("abab"));

}</lang> Output:

3
2

Delphi

<lang Delphi>program OccurrencesOfASubstring;

{$APPTYPE CONSOLE}

uses StrUtils;

function CountSubstring(const aString, aSubstring: string): Integer; var

 lPosition: Integer;

begin

 Result := 0;
 lPosition := PosEx(aSubstring, aString);
 while lPosition <> 0 do
 begin
   Inc(Result);
   lPosition := PosEx(aSubstring, aString, lPosition + Length(aSubstring));
 end;

end;

begin

 Writeln(CountSubstring('the three truths', 'th'));
 Writeln(CountSubstring('ababababab', 'abab'));

end.</lang>

Euphoria

<lang euphoria>function countSubstring(sequence s, sequence sub)

   integer from,count
   count = 0
   from = 1
   while 1 do
       from = match_from(sub,s,from)
       if not from then
           exit
       end if
       from += length(sub)
       count += 1
   end while
   return count

end function

? countSubstring("the three truths","th") ? countSubstring("ababababab","abab")</lang>

Output:

3
2

Fortran

Works with: Fortran version 90 and later

<lang fortran>program Example

 implicit none
 integer :: n
 
 n = countsubstring("the three truths", "th")
 write(*,*) n
 n = countsubstring("ababababab", "abab")
 write(*,*) n
 n = countsubstring("abaabba*bbaba*bbab", "a*b")
 write(*,*) n

contains

function countsubstring(s1, s2) result(c)

 character(*), intent(in) :: s1, s2
 integer :: c, p, posn

 c = 0
 if(len(s2) == 0) return
 p = 1
 do 
   posn = index(s1(p:), s2)
   if(posn == 0) return
   c = c + 1
   p = p + posn + len(s2)
 end do

end function end program</lang> Output

3
2
2

Go

Using strings.Count() method: <lang go>package main import (

       "fmt"
       "strings"

)

func main() {

       fmt.Println(strings.Count("the three truths", "th")) // says: 3
       fmt.Println(strings.Count("ababababab", "abab"))     // says: 2

}</lang>

Groovy

Solution, uses the Groovy "find" operator (=~), and the Groovy-extended Matcher property "count": <lang groovy>println (('the three truths' =~ /th/).count) println (('ababababab' =~ /abab/).count) println (('abaabba*bbaba*bbab' =~ /a*b/).count) println (('abaabba*bbaba*bbab' =~ /a\*b/).count)</lang>

Output:

3
2
9
2

Haskell

<lang haskell> import Data.Text hiding (length)

-- Return the number of non-overlapping occurrences of sub in str. countSubStrs str sub = length $ breakOnAll (pack sub) (pack str)

main = do

 print $ countSubStrs "the three truths" "th"
 print $ countSubStrs "ababababab" "abab"

</lang> Output:

3
2

Icon and Unicon

<lang Icon>procedure main() every A := ![ ["the three truths","th"], ["ababababab","abab"] ] do

  write("The string ",image(A[2])," occurs as a non-overlapping substring ",
        countSubstring!A , " times in ",image(A[1]))

end

procedure countSubstring(s1,s2) #: return count of non-overlapping substrings c := 0 s1 ? while tab(find(s2)) do {

  move(*s2)
  c +:= 1
  }

return c end</lang>

Output:

The string "th" occurs as a non-overlapping substring 3 times in "the three truths"
The string "abab" occurs as a non-overlapping substring 2 times in "ababababab"

J

<lang j>require'strings' countss=: #@] %~ #@[ - [ #@rplc ;~]</lang>

In other words: find length of original string, replace the string to be counted with the empty string, find the difference in lengths and divide by the length of the string to be counted.

Example use:

<lang j> 'the three truths' countss 'th' 3

  'ababababab' countss 'abab'

2</lang>

Java

Works with: Java version 1.5+

The "remove and count the difference" method: <lang java>public class CountSubstring { public static int countSubstring(String subStr, String str){ return (str.length() - str.replace(subStr, "").length()) / subStr.length(); }

public static void main(String[] args){ System.out.println(countSubstring("th", "the three truths")); System.out.println(countSubstring("abab", "ababababab")); System.out.println(countSubstring("a*b", "abaabba*bbaba*bbab")); } }</lang> Output:

3
2
2


Works with: Java version 1.5+

The "split and count" method: <lang java>import java.util.regex.Pattern;

public class CountSubstring { public static int countSubstring(String subStr, String str){ // the result of split() will contain one more element than the delimiter // the "-1" second argument makes it not discard trailing empty strings return str.split(Pattern.quote(subStr), -1).length - 1; }

public static void main(String[] args){ System.out.println(countSubstring("th", "the three truths")); System.out.println(countSubstring("abab", "ababababab")); System.out.println(countSubstring("a*b", "abaabba*bbaba*bbab")); } }</lang> Output:

3
2
2


Works with: Java version 1.5+

This method does a match and counts how many times it matches <lang java>import java.util.regex.Pattern; import java.util.regex.Matcher;

public class CountSubstring { public static int countSubstring(String subStr, String str){ int ans = 0; Matcher m = Pattern.compile(Pattern.quote(subStr)).matcher(str); while (m.find()) ans++;//count it return ans; }

public static void main(String[] args){ System.out.println(countSubstring("th", "the three truths")); System.out.println(countSubstring("abab", "ababababab")); System.out.println(countSubstring("a*b", "abaabba*bbaba*bbab")); } }</lang> Output:

3
2
2

JavaScript

Using regexes: <lang javascript>function countSubstring(str, subStr){ return str.match(new RegExp(subStr, "g")).length }</lang>

K

The dyadic verb _ss gives the positions of substring y in string x. <lang K> "the three truths" _ss "th" 0 4 13

 #"the three truths" _ss "th"

3

 "ababababab" _ss "abab"

0 4

 #"ababababab" _ss "abab"

2 </lang>


Liberty BASIC

<lang lb> print countSubstring( "the three truths", "th") print countSubstring( "ababababab", "abab") end

function countSubstring( a$, s$)

   c =0
   la =len( a$)
   ls =len( s$)
   for i =1 to la -ls
       if mid$( a$, i, ls) =s$ then c =c +1: i =i +ls -1
   next i
   countSubstring =c

end function </lang>

Lua

<lang lua>function Count_Substring( s1, s2 )

local magic =  "[%^%$%(%)%.%[%]%*%+%-%?]"
local percent = function(s)return "%"..s end
   return select( 2, s1:gsub( s2:gsub(magic,percent), "" ) )

end

print( Count_Substring( "the three truths", "th" ) ) print( Count_Substring( "ababababab","abab" ) )</lang>

3
2

Mathematica

<lang Mathematica>StringPosition["the three truths","th",Overlaps->False]//Length 3 StringPosition["ababababab","abab",Overlaps->False]//Length 2</lang>

MATLAB / Octave

<lang Matlab>  % Count occurrences of a substring without overlap

 length(findstr("ababababab","abab",0))
 length(findstr("the three truths","th",0))
 % Count occurrences of a substring with overlap
 length(findstr("ababababab","abab",1)) </lang>

Output:

>>   % Count occurrences of a substring without overlap
>>   length(findstr("ababababab","abab",0))
ans =  2
>>   length(findstr("the three truths","th",0))
ans =  3
>>   % Count occurrences of a substring with overlap
>>   length(findstr("ababababab","abab",1))
ans =  4
>> 

Mirah

<lang mirah>import java.util.regex.Pattern import java.util.regex.Matcher

  1. The "remove and count the difference" method

def count_substring(pattern:string, source:string)

   (source.length() - source.replace(pattern, "").length()) / pattern.length()

end

puts count_substring("th", "the three truths") # ==> 3 puts count_substring("abab", "ababababab") # ==> 2 puts count_substring("a*b", "abaabba*bbaba*bbab") # ==> 2


  1. The "split and count" method

def count_substring2(pattern:string, source:string)

   # the result of split() will contain one more element than the delimiter

# the "-1" second argument makes it not discard trailing empty strings

   source.split(Pattern.quote(pattern), -1).length - 1

end

puts count_substring2("th", "the three truths") # ==> 3 puts count_substring2("abab", "ababababab") # ==> 2 puts count_substring2("a*b", "abaabba*bbaba*bbab") # ==> 2


  1. This method does a match and counts how many times it matches

def count_substring3(pattern:string, source:string)

   result = 0
   Matcher m = Pattern.compile(Pattern.quote(pattern)).matcher(source);
   while (m.find())
       result = result + 1
   end
   result

end

puts count_substring3("th", "the three truths") # ==> 3 puts count_substring3("abab", "ababababab") # ==> 2 puts count_substring3("a*b", "abaabba*bbaba*bbab") # ==> 2 </lang>

NewLISP

<lang NewLISP>; file: stringcount.lsp

url
http://rosettacode.org/wiki/Count_occurrences_of_a_substring
author
oofoe 2012-01-29
Obvious (and non-destructive...)
Note that NewLISP performs an /implicit/ slice on a string or list
with this form "(start# end# stringorlist)". If the end# is omitted,
the slice will go to the end of the string. This is handy here to
keep removing the front part of the string as it gets matched.

(define (scount needle haystack)

 (let ((h (copy haystack)) ; Copy of haystack string.
       (i 0)               ; Cursor.
       (c 0))              ; Count of occurences.
   (while (setq i (find needle h))
     (inc c)
     (setq h ((+ i (length needle)) h)))
   c))                     ; Return count.
Tricky -- Uses functionality from replace function to find all
non-overlapping occurrences, replace them, and return the count of
items replaced in system variable $0.

(define (rcount needle haystack)

 (replace needle haystack "X") $0)
Test

(define (test f needle haystack)

 (println "Found " (f needle haystack) 
          " occurences of '" needle "' in '" haystack "'."))

(dolist (f (list scount rcount))

       (test f "glart" "hinkerpop")
       (test f "abab"  "ababababab")
       (test f "th"    "the three truths")
       (println)
       )

(exit)</lang>

Sample output:

Found 0 occurences of 'glart' in 'hinkerpop'.
Found 2 occurences of 'abab' in 'ababababab'.
Found 3 occurences of 'th' in 'the three truths'.

Found 0 occurences of 'glart' in 'hinkerpop'.
Found 2 occurences of 'abab' in 'ababababab'.
Found 3 occurences of 'th' in 'the three truths'.

OCaml

<lang ocaml>let count_substring str sub =

 let sub_len = String.length sub in
 let len_diff = (String.length str) - sub_len
 and reg = Str.regexp_string sub in
 let rec aux i n =
   if i > len_diff then n else
     try
       let pos = Str.search_forward reg str i in
       aux (pos + sub_len) (succ n)
     with Not_found -> n
 in
 aux 0 0

let () =

 Printf.printf "count 1: %d\n" (count_substring "the three truth" "th");
 Printf.printf "count 2: %d\n" (count_substring "ababababab" "abab");
</lang>

Pascal

See Delphi

Perl

<lang perl>sub countSubstring {

 my $str = shift;
 my $sub = quotemeta(shift);
 my $count = () = $str =~ /$sub/g;
 return $count;
  1. or return scalar( () = $str =~ /$sub/g );

}

print countSubstring("the three truths","th"), "\n"; # prints "3" print countSubstring("ababababab","abab"), "\n"; # prints "2"</lang>

Perl 6

<lang perl6>sub count-substring($big,$little) { +$big.comb: /$little/ }

say count-substring("the three truths","th"); say count-substring("ababababab","abab");</lang> Note that in Perl 6 the /$little/ matches the variable literally, so there's no need to quote regex metacharacters. Also, prefix + forces numeric context in Perl 6 (it's a no-op in Perl 5). One other style point: we now tend to prefer hyphenated names over camelCase.

PHP

<lang php><?php echo substr_count("the three truths", "th"), "\n"; // prints "3" echo substr_count("ababababab", "abab"), "\n"; // prints "2" ?></lang>

PicoLisp

<lang PicoLisp>(de countSubstring (Str Sub)

  (let (Cnt 0  H (chop Sub))
     (for (S (chop Str)  S  (cdr S))
        (when (head H S)
           (inc 'Cnt)
           (setq S (map prog2 H S)) ) )
     Cnt ) )</lang>

Test:

: (countSubstring "the three truths" "th")
-> 3

: (countSubstring "ababababab" "abab")
-> 2

PL/I

<lang PL/I> cnt: procedure options (main);

  declare (i, tally) fixed binary;
  declare (text, key) character (100) varying;
  get edit (text) (L); put skip data (text);
  get edit (key)  (L); put skip data (key);
  tally = 0; i = 1;
  do until (i = 0);
     i = index(text, key, i);
     if i > 0 then do; tally = tally + 1; i = i + length(key); end;
  end;
  put skip list (tally);

end cnt; </lang> Output for the two specified strings is as expected. Output for the following data:-

TEXT='AAAAAAAAAAAAAAA';
KEY='AA';
        7 

PowerBASIC

Works with: PB/Win
Works with: PB/CC

Windows versions of PowerBASIC (at least since PB/Win 7, and possibly earlier) provide the TALLY function, which does exactly what this task requires (count non-overlapping substrings).

PB/DOS can use the example under BASIC, above.

Note that while this example is marked as working with PB/Win, the PRINT statement would need to be replaced with MSGBOX, or output to a file. (PB/Win does not support console output.)

<lang powerbasic>FUNCTION PBMAIN () AS LONG

   PRINT "the three truths, th:", TALLY("the three truths", "th")
   PRINT "ababababab, abab:", TALLY("ababababab", "abab")

END FUNCTION</lang>

Output:

the three truths, th:        3
ababababab, abab:            2

PureBasic

<lang PureBasic>a = CountString("the three truths","th") b = CountString("ababababab","abab")

a = 3
b = 2</lang>

Python

<lang python>>>> "the three truths".count("th") 3 >>> "ababababab".count("abab") 2</lang>

Ruby

<lang ruby>irb(main):001:0> "the three truths".scan("th").length => 3 irb(main):002:0> "ababababab".scan("abab").length => 2</lang>

String#scan returns an array of substrings, and Array#length (or Array#size) counts them.

Seed7

<lang seed7>$ include "seed7_05.s7i";

const func integer: countSubstring (in string: stri, in string: searched) is func

 result
   var integer: count is 0;
 local
   var integer: offset is 0;
 begin
   offset := pos(stri, searched);
   while offset <> 0 do
     incr(count);
     offset := pos(stri, searched, offset + length(searched));
   end while;
 end func;

const proc: main is func

 begin
   writeln(countSubstring("the three truths", "th"));
   writeln(countSubstring("ababababab", "abab"));
 end func;</lang>

Output:

3
2

SNOBOL4

<lang SNOBOL4>

       DEFINE("countSubstring(t,s)")
       OUTPUT = countSubstring("the three truths","th")
       OUTPUT = countSubstring("ababababab","abab")
       :(END)

countSubstring t ARB s = :F(RETURN)

       countSubstring = countSubstring + 1 :(countSubstring)

END 3 2 </lang>

Tcl

The regular expression engine is ideal for this task, especially as the ***= prefix makes it interpret the rest of the argument as a literal string to match: <lang tcl>proc countSubstrings {haystack needle} {

   regexp -all ***=$needle $haystack

} puts [countSubstrings "the three truths" "th"] puts [countSubstrings "ababababab" "abab"] puts [countSubstrings "abaabba*bbaba*bbab" "a*b"]</lang>

Output:

3
2
2

TXR

<lang txr>@(next :args) @(do (defun count-occurrences (haystack needle)

      (for* ((occurrences 0)
             (old-pos 0)
             (new-pos (search-str haystack needle old-pos nil)))
            (new-pos occurrences)
            ((inc occurrences)
             (set old-pos (+ new-pos (length needle)))
             (set new-pos (search-str haystack needle old-pos nil))))))

@ndl @hay @(output) @(count-occurrences hay ndl) occurrences(s) of @ndl inside @hay @(end)</lang>

$ ./txr count-occurrences.txr "baba" "babababa"
2 occurence(s) of baba inside babababa
$ ./txr count-occurrences.txr "cat" "catapultcatalog"
2 occurence(s) of cat inside catapultcatalog