Strip control codes and extended characters from a string: Difference between revisions
Line 768: | Line 768: | ||
next i |
next i |
||
END FUNCTION</lang> |
END FUNCTION</lang> |
||
<pre> |
|||
input : chr$(31)+"abc"+chr$(13)+"def"+chr$(11)+"ghi"+chr$(10) |
|||
output : abcdefghi</pre> |
|||
=={{header|Seed7}}== |
=={{header|Seed7}}== |
Revision as of 10:49, 1 April 2012
You are encouraged to solve this task according to the task description, using any language you may know.
The task is to strip control codes and extended characters from a string. The solution should demonstrate how to achieve each of the following results:
- a string with control codes stripped (but extended characters not stripped)
- a string with control codes and extended characters stripped
In ASCII, the control codes have decimal codes 0 through 31 and 127; codes greater than 127 are extended characters. On an ASCII based system, if the control codes and extended characters are stripped, the resultant string would have all of its characters within the range of 32 to 126 decimal on the ASCII table.
On a non-ASCII based system, we consider characters that do not have a corresponding glyph on the ASCII table (within the ASCII range of 32 to 126 decimal) to be an extended character for the purpose of this task.
AutoHotkey
<lang AHK>; Returns x with control codes (0-31 and 127) and extended characters
; (128 and above) removed, i.e. only character codes 32..126 are kept.
Stripped(x){
	Loop Parse, x
		if Asc(A_LoopField) > 31 and Asc(A_LoopField) < 127	; 127 (DEL) is a control code too
			r .= A_LoopField
	return r
}
MsgBox % stripped("`ba" Chr(00) "b`n`rc`fd" Chr(0xc3))</lang>
C
<lang C>#include <stdio.h>
#include <stdlib.h>
#define IS_CTRL (1 << 0)
#define IS_EXT (1 << 1)
#define IS_ALPHA (1 << 2)
#define IS_DIGIT (1 << 3) /* not used, just give you an idea */
unsigned int char_tbl[256] = {0};
/* could use ctypes, but then they pretty much do the same thing */
/* Set the IS_* class bits in char_tbl for every byte value. */
void init_table()
{
	int code;

	/* extended range: every byte value from 128 up */
	for (code = 255; code >= 128; code--)
		char_tbl[code] |= IS_EXT;

	/* letters: upper case and the matching lower case, 0x20 above it */
	for (code = 'A'; code <= 'Z'; code++) {
		char_tbl[code] |= IS_ALPHA;
		char_tbl[code + 0x20] |= IS_ALPHA;
	}

	/* control codes: DEL plus everything below the blank */
	char_tbl[127] |= IS_CTRL;
	for (code = 0; code < 32; code++)
		char_tbl[code] |= IS_CTRL;
}
/* depends on what "stripped" means; we do it in place.
 * "what" is a combination of the IS_* macros, meaning strip if
 * a char IS_ any of them */
void strip(char * str, int what)
{
	unsigned char *src = (unsigned char *)str;	/* read cursor */
	unsigned char *dst = src;			/* write cursor, never ahead of src */

	for (; *src != '\0'; src++) {
		if (char_tbl[*src] & what)
			continue;	/* belongs to a class being stripped */
		*dst++ = *src;
	}
	*dst = '\0';
}
/* Fill buf with every byte value 1..255 in order and NUL-terminate it.
 * buf must have room for 256 bytes. */
static void fill_all_chars(char *buf)
{
	int i;

	/* run through 255 inclusive so that buf[254] is written as well */
	for (i = 1; i <= 255; i++)
		buf[i - 1] = (char)i;
	buf[255] = '\0';
}

/* Print the full byte range three times, stripping a larger set of
 * character classes on each pass. */
int main()
{
	char a[256];

	init_table();

	/* populate string with one of each char */
	fill_all_chars(a);
	strip(a, IS_CTRL);
	printf("%s\n", a);

	fill_all_chars(a);
	strip(a, IS_CTRL | IS_EXT);
	printf("%s\n", a);

	fill_all_chars(a);
	strip(a, IS_CTRL | IS_EXT | IS_ALPHA);
	printf("%s\n", a);

	return 0;
}</lang>output:<lang> !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ <odd stuff my xterm thinks are bad unicode hence can't be properly shown>
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ !"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~</lang>
C++
<lang Cpp>#include <string>
#include <iostream>
#include <algorithm>
#include <boost/lambda/lambda.hpp>
#include <boost/lambda/casts.hpp>
#include <ctime>
#include <cstdlib>
using namespace boost::lambda ;
struct MyRandomizer {
char operator( )( ) { return static_cast<char>( rand( ) % 256 ) ; }
} ;
// Returns a copy of startstring with the ASCII control codes
// (0..31 and 127) removed; extended characters (128..255) are kept.
std::string deleteControls ( std::string startstring ) {
   std::string noControls ;
   noControls.reserve( startstring.size( ) ) ;
   for ( std::string::size_type i = 0 ; i < startstring.size( ) ; i++ ) {
      // go through unsigned char so that bytes above 127 do not turn
      // negative (and look like control codes) where plain char is signed
      const int code = static_cast<unsigned char>( startstring[ i ] ) ;
      if ( code >= 32 && code != 127 )
         noControls.push_back( startstring[ i ] ) ;
   }
   return noControls ;
}
// Returns a copy of startstring with both the ASCII control codes
// (0..31 and 127) and the extended characters (128..255) removed, so
// only the codes 32..126 remain.
std::string deleteExtended( std::string startstring ) {
   std::string noExtended ;
   noExtended.reserve( startstring.size( ) ) ;
   for ( std::string::size_type i = 0 ; i < startstring.size( ) ; i++ ) {
      // unsigned char keeps bytes above 127 positive where char is signed
      const int code = static_cast<unsigned char>( startstring[ i ] ) ;
      if ( code >= 32 && code <= 126 )
         noExtended.push_back( startstring[ i ] ) ;
   }
   return noExtended ;
}
int main( ) {
std::string my_extended_string ; for ( int i = 0 ; i < 40 ; i++ ) //we want the extended string to be 40 characters long my_extended_string.append( " " ) ; srand( time( 0 ) ) ; std::generate_n( my_extended_string.begin( ) , 40 , MyRandomizer( ) ) ; std::string no_controls( deleteControls( my_extended_string ) ) ; std::string no_extended ( deleteExtended( my_extended_string ) ) ; std::cout << "string with all characters: " << my_extended_string << std::endl ; std::cout << "string without control characters: " << no_controls << std::endl ; std::cout << "string without extended characters: " << no_extended << std::endl ; return 0 ;
}</lang> Output:
string with all characters: K�O:~���7�5���� ���W��@>��ȓ�q�Q@���W- string without control characters: K�O:~���7�5���� ���W��@>��ȓ�q�Q@���W- string without extended characters: KO:~75W@>qQ@W-
Fortran
<lang fortran>module stripcharacters
implicit none

contains

  ! True when ch is not an ASCII control code (0..31 or 127).
  pure logical function not_control(ch)
    character, intent(in) :: ch
    not_control = iachar(ch) >= 32 .and. iachar(ch) /= 127
  end function not_control

  ! True when ch is neither a control code nor an extended character,
  ! i.e. its code lies in 32..126.
  pure logical function not_extended(ch)
    character, intent(in) :: ch
    not_extended = iachar(ch) >= 32 .and. iachar(ch) < 127
  end function not_extended

  ! Returns the characters of string for which accept() is true, packed
  ! to the left; the result has the length of string and is blank padded.
  pure function strip(string,accept) result(str)
    character(len=*), intent(in) :: string
    character(len=len(string)) :: str
    interface
      pure logical function accept(ch)
        character, intent(in) :: ch
      end function accept
    end interface
    integer :: i,n
    str = repeat(' ',len(string))
    n = 0
    do i=1,len(string)
      if ( accept(string(i:i)) ) then
        n = n+1
        str(n:n) = string(i:i)
      end if
    end do
  end function strip

end module stripcharacters
program test
  use stripcharacters
  implicit none

  character(len=256) :: string, str
  integer :: i

  ! string(i:i) holds character code i-1, so the codes 0..255 are all covered
  forall (i=1:len(string)) string(i:i) = achar(i-1)
  write (*,*) string

  write (*,*) 'Control characters deleted:'
  str = strip(string,not_control)
  write (*,*) str

  forall (i=1:len(string)) string(i:i) = achar(i-1)
  write (*,*) 'Extended characters deleted:'
  write (*,*) strip(string,not_extended)

end program test </lang>
Go
Go works for ASCII and non-ASCII systems. The first pair of functions below interpret strings as byte strings, presumably useful for strings consisting of ASCII and 8-bit extended ASCII data. The second pair of functions interpret strings as UTF-8. <lang go>package main
import (
"fmt" "strings"
)
// two byte-oriented functions identical except for operator comparing c to 127.

// stripCtlFromBytes returns str without the bytes below 32 and the byte 127;
// every other byte, including those above 127, is kept.
func stripCtlFromBytes(str string) string {
	kept := make([]byte, 0, len(str))
	for _, c := range []byte(str) {
		if c < 32 || c == 127 {
			continue
		}
		kept = append(kept, c)
	}
	return string(kept)
}
// stripCtlAndExtFromBytes returns str reduced to the bytes 32 through 126.
func stripCtlAndExtFromBytes(str string) string {
	kept := make([]byte, 0, len(str))
	for _, c := range []byte(str) {
		if c < 32 || c >= 127 {
			continue
		}
		kept = append(kept, c)
	}
	return string(kept)
}
// two UTF-8 functions identical except for operator comparing c to 127

// stripCtlFromUTF8 returns str without the runes below 32 and the rune 127;
// every other rune is kept.
func stripCtlFromUTF8(str string) string {
	keep := func(r rune) rune {
		switch {
		case r < 32, r == 127:
			return -1 // a negative result makes strings.Map drop the rune
		}
		return r
	}
	return strings.Map(keep, str)
}
// stripCtlAndExtFromUTF8 returns str reduced to the runes 32 through 126.
func stripCtlAndExtFromUTF8(str string) string {
	keep := func(r rune) rune {
		switch {
		case r < 32, r >= 127:
			return -1 // a negative result makes strings.Map drop the rune
		}
		return r
	}
	return strings.Map(keep, str)
}
const src = "déjà vu" + // precomposed unicode
"\n\000\037 \041\176\177\200\377\n" + // various boundary cases "as⃝df̅" // unicode combining characters
func main() {
fmt.Println("source text:") fmt.Println(src, "\n") fmt.Println("as bytes, stripped of control codes:") fmt.Println(stripCtlFromBytes(src), "\n") fmt.Println("as bytes, stripped of control codes and extended characters:") fmt.Println(stripCtlAndExtFromBytes(src), "\n") fmt.Println("as UTF-8, stripped of control codes:") fmt.Println(stripCtlFromUTF8(src), "\n") fmt.Println("as UTF-8, stripped of control codes and extended characters:") fmt.Println(stripCtlAndExtFromUTF8(src))
}</lang> Output: (varies with display configuration)
source text: déjà vu � !~?�� as⃝df̅ as bytes, stripped of control codes: déjà vu !~��as⃝df̅ as bytes, stripped of control codes and extended characters: dj vu !~asdf as UTF-8, stripped of control codes: déjà vu !~��as⃝df̅ as UTF-8, stripped of control codes and extended characters: dj vu !~asdf
Icon and Unicon
We'll use deletec to remove unwanted characters (2nd argument) from a string (1st argument). The procedure below coerces types back and forth between string and cset. The character set of unwanted characters is the difference of all ASCII characters and the ASCII characters from 32 to 126. <lang Icon>procedure main(A)
   # &ascii holds the codes 0..127 at positions 1..128 and s[i:j] stops just
   # before position j, so [33:128] selects codes 32 (blank) through 126 (~).
   write(image(deletec(&ascii,&ascii--(&ascii)[33:128])))
end

link strings
</lang>
The IPL procedure deletec is equivalent to this: <lang Icon>procedure deletec(s, c)		#: delete characters
   result := ""
   s ? {
      while result ||:= tab(upto(c)) do tab(many(c))
      return result ||:= tab(0)
      }
end</lang>
Output:
" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
J
Solution: <lang j>stripControlCodes=: -.&(DEL,32{.a.) stripControlExtCodes=: ([ -. -.)&(32}.127{.a.)</lang> Usage: <lang j> mystring=: a. {~ ?~256 NB. ascii chars 0-255 in random order
#mystring NB. length of string
256
#stripControlCodes mystring NB. length of string without control codes
223
#stripControlExtCodes mystring NB. length of string without control codes or extended chars
95
#myunicodestring=: u: ?~1000 NB. unicode characters 0-999 in random order
1000
#stripControlCodes myunicodestring
967
#stripControlExtCodes myunicodestring
95
stripControlExtCodes myunicodestring
k}w:]U3xEh9"GZdr/#^B.Sn%\uFOo[(`t2-J6*IA=Vf&N;lQ8,${XLz5?D0~s)'Y7Kq|ip4<WRCaM!b@cgv_T +mH>1ejPy</lang>
Liberty BASIC
<lang lb>
all$ ="" for i =0 to 255 all$ =all$ +chr$( i) next i
print "Original string of bytes. ( chr$( 10) causes a CRLF.)" print all$ print
lessControl$ =controlStripped$( all$) print "With control codes stripped out." print lessControl$ print
lessExtendedAndControl$ =extendedStripped$( lessControl$) print "With extended codes stripped out too." print lessExtendedAndControl$
end
function controlStripped$( i$)
    ' Returns i$ without the ASCII control codes 0-31 and 127 (DEL).
    r$ =""
    for j =1 to len( i$)
        ch$ =mid$( i$, j, 1)
        if asc( ch$) >=32 and asc( ch$) <>127 then r$ =r$ +ch$
    next j
    controlStripped$ =r$
end function
function extendedStripped$( i$)
    ' Returns i$ without the extended characters, i.e. codes 128 and above.
    r$ =""
    for j =1 to len( i$)
        ch$ =mid$( i$, j, 1)
        if asc( ch$) <128 then r$ =r$ +ch$
    next j
    extendedStripped$ =r$
end function
</lang>
Lua
<lang lua>-- Returns str with every character of Lua's control class (%c) removed.
function Strip_Control_Codes( str )
    -- gsub also returns a match count; the assignment keeps the string only
    local cleaned = str:gsub( "%c+", "" )
    return cleaned
end
-- Returns str reduced to the bytes 32 through 126.
function Strip_Control_and_Extended_Codes( str )
    local kept = {}
    for pos = 1, #str do
        local code = str:byte(pos)
        if not (code < 32 or code > 126) then
            kept[#kept + 1] = string.char(code)
        end
    end
    return table.concat(kept)
end
q = "" for i = 0, 255 do q = q .. string.char(i) end
print( Strip_Control_Codes(q) ) print( Strip_Control_and_Extended_Codes(q) )</lang>
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
Mathematica
<lang Mathematica>(* keep blank (32) through tilde (126) plus the extended codes 128..255 *)
stripCtrl[x_]:=StringJoin[Select[Characters[x], MemberQ[CharacterRange[" ","~"]~Join~Characters[FromCharacterCode[Range[128,255]]],#]&]]
(* keep only blank (32) through tilde (126) *)
stripCtrlExt[x_]:=StringJoin[Select[Characters[x], MemberQ[CharacterRange[" ","~"],#]&]]</lang>
Test:
CompleteSet=FromCharacterCode[Range[0,255]] ->\.00\.02\.03\.04\.05\.06\.07\.08\.0b\.0e\.0f\.10\.11\.12\.13\.14 \.15\.16\.17\.18\.19\.1a\[RawEscape]\.1c\.1d\.1e\.1f !"#$%&'()*+,-./ 0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\] ^_`abcdefghijklmnopqrstuvwxyz{|}~? ¡¢£¤¥¦§¨©ª«\[Not]®¯\[Degree] \[PlusMinus]\.b2\.b3\.b4\[Micro]\[Paragraph]\[CenterDot]¸¹º»¼½¾¿ ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ*ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö/øùúûüýþÿ stripCtrl[CompleteSet] ->!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\] ^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«\[Not]®¯\[Degree] \[PlusMinus]\.b2\.b3\.b4\[Micro]\[Paragraph]\[CenterDot] ¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ*ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö /øùúûüýþÿ stripCtrlExt[CompleteSet] ->!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\] ^_`abcdefghijklmnopqrstuvwxyz{|}~
MATLAB / Octave
<lang MATLAB> function str = stripped(str)
   % Keep only the elements whose value lies strictly between 31 and 127.
   printable = (str > 31) & (str < 127);
   str = str(printable);
 end; </lang>
OCaml
<lang ocaml>let is_control_code c =
let d = int_of_char c in d < 32 || d = 127
let is_extended_char c =
let d = int_of_char c in d > 127
(* [strip f str] returns [str] without the characters for which [f] is
   true.  The result is collected in a Buffer, so no string is mutated
   in place. *)
let strip f str =
  let kept = Buffer.create (String.length str) in
  String.iter (fun c -> if not (f c) then Buffer.add_char kept c) str;
  Buffer.contents kept
let () =
let len = 32 in let s = String.create len in Random.self_init(); for i = 0 to pred len do s.[i] <- char_of_int (Random.int 256) done; print_endline (strip is_control_code s); print_endline (strip (fun c -> (is_control_code c) || (is_extended_char c)) s);
- </lang>
Perl
<lang Perl>#!/usr/bin/perl -w
use strict ;

# Character codes (0..255) of a random 40 character sample.
my @letters ;
my @nocontrols ;
my @noextended ;
for ( 1..40 ) {
   push @letters , int( rand( 256 ) ) ;
}
print "before sanitation : " ;
print join( '' , map { chr( $_ ) } @letters ) ;
print "\n" ;
# Control codes are 0..31 and 127; 32 (the blank) is printable and stays.
@nocontrols = grep { $_ >= 32 && $_ != 127 } @letters ;
print "Without controls: " ;
print join( '' , map { chr( $_ ) } @nocontrols ) ;
# 127 is already gone, so dropping everything from 127 up leaves 32..126.
@noextended = grep { $_ < 127 } @nocontrols ;
print "\nWithout extended: " ;
print join( '' , map { chr( $_ ) } @noextended ) ;
print "\n" ;</lang> Output:
before sanitation : �L08&YH�O��n)�:���O�G$���.���"zO���Q�?�� Without controls: �L08&YH�O��n)�:�O�G$���.���"zO��Q�?�� Without extended: L08&YHOn):OG$."zOQ?
Perl 6
<lang perl6>my $str = (0..400).roll(80)».chr.join;

say $str;
# An empty replacement string makes subst delete every match.
say $str.subst(/<[ ^@..^_ ]>/, '', :g);
say $str.subst(/<-[ \ ..~ ]>/, '', :g);</lang>
�¶ØèúđkƌĘ�r=êıƏÄÙÍy1SGa%TÑ�ęMRŅ�EŧİÌŬńĩµ9ŒďĔÜÉĈĬzijdś5FúŨƏźƅíýÛÃņGÏ ö~ƀRÑú ¶ØèúđkƌĘr=êıƏÄÙÍy1SGa%TÑęMRŅEŧİÌŬńĩµ9ŒďĔÜÉĈĬzijdś5FúŨƏźƅíýÛÃņGÏö~ƀRÑú kr=y1SGa%TMRE9zd5FG~R
PicoLisp
Control characters in strings are written with a hat (^) in PicoLisp. ^? is the DEL character. <lang PicoLisp>(de stripCtrl (Str)
(pack (filter '((C) (nor (= "^?" C) (> " " C "^A")) ) (chop Str) ) ) )
(de stripCtrlExt (Str)
(pack (filter '((C) (> "^?" C "^_")) (chop Str) ) ) )</lang>
Test:
: (char "^?") -> 127 : (char "^_") -> 31 : (stripCtrl "^I^M a b c^? d äöüß") -> " a b c d äöüß" : (stripCtrlExt "^I^M a b c^? d äöüß") -> " a b c d "
Pike
<lang Pike>> string input = random_string(100); > (string)((array)input-enumerate(32)-enumerate(255-126,1,127)); Result: "p_xx08M]cK<FHgR3\\I.x>)Tm<VgakYddy&P7"</lang>
PL/I
<lang PL/I> stripper: proc options (main);
declare s character (100) varying; declare i fixed binary;
s = 'the quick brown fox jumped'; /* A loop to replace blanks with control characters */ do i = 1 to length(s); if substr(s, i, 1) = ' ' then substr(s, i, 1) = '01'x; end; put skip list (s);
call stripcc (s); put skip list (s);
s = 'now is the time for all good men'; /* A loop to replace blanks with control characters */ do i = 1 to length(s); if substr(s, i, 1) = ' ' then substr(s, i, 1) = 'A1'x; end; put skip list (s);
call stripex (s); put skip list (s);
/* Strip control codes. */
/* Replaces s by its characters that are not control codes, i.e. that */
/* are neither below blank ('20'x) nor equal to DEL ('7F'x).          */
stripcc: procedure (s);
   declare s character (*) varying;
   declare w character (length(s));
   declare c character (1);
   declare (i, j) fixed binary;

   j = 0;
   do i = 1 to length (s);
      c = substr(s, i, 1);
      /* both tests must hold: at or above blank AND not DEL */
      if unspec(c) >= '00100000'b & unspec(c) ^= '01111111'b then
         do;
            j = j + 1;
            substr(w, j, 1) = c;
         end;
   end;
   s = substr(w, 1, j);
end stripcc;
/* Strips control codes and extended characters. */ stripex: procedure (s);
declare s character (*) varying; declare w character (length(s)); declare c character (1); declare (i, j) fixed binary;
j = 0; do i = 1 to length (s); c = substr(s, i, 1); if unspec(c) >= '00100000'b & unspec(c) < '01111111'b then do; j = j + 1; substr(w, j, 1) = c; end; end; s = substr(w, 1, j);
end stripex;
end stripper; </lang> Output:
the�quick�brown�fox�jumped thequickbrownfoxjumped now¡is¡the¡time¡for¡all¡good¡men nowisthetimeforallgoodmen
PureBasic
<lang PureBasic>Procedure.s stripControlCodes(source.s)
  ; Returns source without the control codes 0..31 and 127 (DEL).
  Protected i, *ptrChar.Character, length = Len(source), result.s
  *ptrChar = @source
  For i = 1 To length
    If *ptrChar\c > 31 And *ptrChar\c <> 127
      result + Chr(*ptrChar\c)
    EndIf
    *ptrChar + SizeOf(Character)
  Next
  ProcedureReturn result
EndProcedure
Procedure.s stripControlExtCodes(source.s)
  ; Returns source reduced to the character codes 32..126, i.e. without
  ; control codes (0..31, 127) and without extended characters (128+).
  Protected i, *ptrChar.Character, length = Len(source), result.s
  *ptrChar = @source
  For i = 1 To length
    If *ptrChar\c > 31 And *ptrChar\c < 127
      result + Chr(*ptrChar\c)
    EndIf
    *ptrChar + SizeOf(Character)
  Next
  ProcedureReturn result
EndProcedure
If OpenConsole()
;create sample string Define i, s.s For i = 1 To 80 s + Chr(Random(254) + 1) ;include character values from 1 to 255 Next
PrintN(stripControlCodes(s)) ;string without control codes PrintN("---------") PrintN(stripControlExtCodes(s)) ;string without control codes or extended chars Print(#CRLF$ + #CRLF$ + "Press ENTER to exit"): Input() CloseConsole()
EndIf</lang> Sample output:
»╫=┐C─≡G(═ç╤â√╝÷╔¬ÿ▌x è4∞|)ï└⌐ƒ9²òτ┌ºáj)▓<~-vPÿφQ╨ù¿╖îFh"[ü╗dÉ₧q#óé├p╫■ --------- =CG(x 4|)9j)<~-vPQFh"[dq#p
Python
<lang Python>def stripped(x): return "".join([i for i in x if ord(i) in range(32, 127)])
print stripped("\ba\x00b\n\rc\fd\xc3")</lang>Output:<lang>abcd</lang>
REXX
version 1
This version is much faster, but much harder to understand what's happening. <lang rexx> /*REXX program to strip all "control codes" from a string (ASCII|EBCDIC)*/
xxx='string of ����?, may include control characters and other ilk.������' ebcdic=1=='f1'x /*is this an EBCDIC|ASCII system?*/
ccChars=xrange('0'x,d2c(c2d(' ')-1)) /*generate a range of characters.*/ if \ebcdic then ccChars=ccChars'7f'x /*add the ASCII '7f'X char. */
say 'hex ccChars =' c2x(ccChars)
yyy=translate(space(translate(xxx,'ff'x," "ccChars),0),,'ff'x) say 'old = >>>'xxx"<<<" say 'new = >>>'yyy"<<<" </lang> Output:
hex ccChars = 000102030405060708090A0B0C0D0E0F101112131415161718191A1B1C1D1E1F7F old = >>>string of ☺☻♥♦⌂, may include control characters and other ilk.♫☼§►↔◄<<< new = >>>string of , may include control characters and other ilk.<<<
version 2
A slower version, but much easier to understand the process. <lang rexx> /*REXX program to strip all "control codes" from a string (ASCII|EBCDIC)*/
xxx='string of ☺☻♥♦⌂, may include control characters and other ilk.♫☼§►↔◄' ebcdic=1=='f1'x /*is this an EBCDIC|ASCII system?*/
ccChars=xrange('0'x,d2c(c2d(' ')-1)) /*generate a range of characters.*/ if \ebcdic then ccChars=ccChars'7f'x /*add the ASCII '7f'X char. */
say 'hex ccChars =' c2x(ccChars) yyy=
do j=1 for length(xxx) _=substr(xxx,j,1) if pos(_,ccChars)\==0 then iterate /*skip this char, it's a no-no. */ yyy=yyy||_ end
say 'old = >>>'xxx"<<<"
say 'new = >>>'yyy"<<<"
</lang>
Output is identical to version 1.
Ruby
<lang ruby>class String
def strip_control_characters() self.chars.inject("") do |str, char| unless char.ascii_only? and (char.ord < 32 or char.ord == 127) str << char end str end end
def strip_control_and_extended_characters() self.chars.inject("") do |str, char| if char.ascii_only? and char.ord.between?(32,126) str << char end str end end
end
p s = "\ba\x00b\n\rc\fd\xc3\x7ffoo" p s.strip_control_characters p s.strip_control_and_extended_characters</lang>
outputs
"\ba\u0000b\n\rc\fd\xC3\u007Ffoo" "abcd\xC3foo" "abcdfoo"
Run BASIC
<lang runbasic>s$ = chr$(31) + "abc" + chr$(13) + "def" + chr$(11) + "ghi" + chr$(10) print strip$(s$)
' -----------------------------------------
' strip junk
' keeps only the printable ASCII characters,
' i.e. the codes 32 through 126
' -----------------------------------------
FUNCTION strip$(str$)
for i = 1 to len(str$)
  a$ = MID$(str$,i,1)
  a  = ASC(a$)
  if a > 31 then
    if a < 127 then
      strip$ = strip$ + a$
    end if
  end if
next i
END FUNCTION</lang>
input : chr$(31)+"abc"+chr$(13)+"def"+chr$(11)+"ghi"+chr$(10) output : abcdefghi
Seed7
Seed7 strings are UTF-32 encoded, therefore no distinction between BYTE and Unicode strings is necessary. The example below uses STD_UTF8_OUT from the library utf8.s7i, to write Unicode characters with UTF-8 encoding to the console.
<lang seed7>$ include "seed7_05.s7i";
include "utf8.s7i";
const func string: stripControl (in string: stri) is func
result var string: stripped is ""; local var integer: old_pos is 1; var integer: index is 0; var char: ch is ' '; begin for ch key index range stri do if ch < ' ' or ch = '\127\' then stripped &:= stri[old_pos .. pred(index)]; old_pos := succ(index); end if; end for; stripped &:= stri[old_pos ..]; end func;
const func string: stripControlAndExtended (in string: stri) is func
result var string: stripped is ""; local var integer: old_pos is 1; var integer: index is 0; var char: ch is ' '; begin for ch key index range stri do if ch < ' ' or ch >= '\127\' then stripped &:= stri[old_pos .. pred(index)]; old_pos := succ(index); end if; end for; stripped &:= stri[old_pos ..]; end func;
const string: src is "déjà vu\ # Unicode
\\n\0\\31\ \33\\126\\127\\128\\255\\n\ # Various boundary cases \as⃝df̅"; # Unicode combining characters
const proc: main is func
begin OUT := STD_UTF8_OUT; writeln("source text:"); writeln(src); writeln("Stripped of control codes:"); writeln(stripControl(src)); writeln("Stripped of control codes and extended characters:"); writeln(stripControlAndExtended(src)); end func;</lang>
Output:
source text: déjà vu � !~?ÿ as⃝df̅ Stripped of control codes: déjà vu !~ÿas⃝df̅ Stripped of control codes and extended characters: dj vu !~asdf
Tcl
<lang tcl>proc stripAsciiCC str {
regsub -all {[\u0000-\u001f\u007f]+} $str ""
} proc stripCC str {
regsub -all {[^\u0020-\u007e]+} $str ""
}</lang>