[Sc-devel] regexp support revisited :)

Florian Schmidt mista.tapas at gmx.net
Sat Nov 3 22:54:40 PST 2007


On Saturday 03 November 2007, Florian Schmidt wrote:
> > e.g.
> > "sesame seeds".findRegExp("s.*e")
> > [[0, "se"], [2, "same"], [7, "see"]]
> >
> > This would give more flexibility (because of course it's easy to find
> > the lengths from the above result).
>
> Sure. I thought about this, too, for the same reason. But i was too lazy to
> look at how to create strings ;)

Ok, this version (see attachment or below) shows this behaviour, besides some 
additional bugfixes ;) This version also behaves more like String.find() 
in that it no longer returns the match position relative to the offset, 
but relative to the start of the string.

"foobar".findRegexp("o*bar")
[ [ 1, oobar ] ]

"32424 334 /**aaaaaa*/".findRegexp("/\\*\\*a*\\*/")
[ [ 10, /**aaaaaa*/ ] ]

"foobar".findRegexp("(o*)(bar)")
[ [ 1, oobar ], [ 1, oo ], [ 3, bar ] ]

"aaaabaaa".findAllRegexp("a+")
[ [ [ 0, aaaa ] ], [ [ 1, aaa ] ], [ [ 2, aa ] ], [ [ 3, a ] ], [ [ 5, 
aaa ] ], [ [ 6, aa ] ], [ [ 7, a ] ] ]

Here are also some docs, which I don't know how to get into the Help file (using 
an HTML editor?? - yuk!!)

I guess one could think about exposing some more features of the boost regex 
classes. Tell me what you think..

----------------------- SNIP

findRegexp(expression, offset)

POSIX regular expression matching using libboost. Returns a list of submatches, 
or nil if nothing matches or on error. A submatch consists of a List [offset, match]. 
The first submatch simply consists of the matched string of the whole expression, while 
subsequent submatches consist of the matches corresponding to subexpressions. 

Examples without subexpressions:

// Simple expression:
"foobar".findRegexp("o*bar")

// We have to escape special chars twice * -> \\*:
"32424 334 /**aaaaaa*/".findRegexp("/\\*\\*a*\\*/")

Examples with subexpressions (use parentheses to mark subexpressions):

// This should make it obvious what happens :)
"foobar".findRegexp("(o*)(bar)")

findAllRegexp(expression, offset)

Iterates over the String to find all matches of a pattern. Each search restarts 
one character after the start of the previous match, so matches may overlap (some 
characters of the source string may be contained in more than one match).

Examples:

"aaaabaaa".findAllRegexp("a+")

----------------------- SNIP


Index: build/SCClassLibrary/Common/Collections/String.sc
===================================================================
--- build/SCClassLibrary/Common/Collections/String.sc	(revision 6504)
+++ build/SCClassLibrary/Common/Collections/String.sc	(working copy)
@@ -126,6 +126,23 @@
 	containsi { arg string, offset = 0;
 		^this.find(string, true, offset).notNil
 	}
+  
+  findRegexp { arg regexp, offset = 0;
+	_String_FindRegexp
+	^this.primitiveFailed
+  }
+	findAllRegexp { arg string, offset = 0;
+		var indices = [], i=[];
+		while { 
+			i = this.findRegexp(string, offset); 
+			i.notNil
+		}{
+			indices = indices.add(i);
+			offset = i[0][0] + 1;
+		}
+		^indices
+	}
+
 	find { arg string, ignoreCase = false, offset = 0;
 		_String_Find
 		^this.primitiveFailed
Index: Source/lang/LangPrimSource/PyrStringPrim.cpp
===================================================================
--- Source/lang/LangPrimSource/PyrStringPrim.cpp	(revision 6504)
+++ Source/lang/LangPrimSource/PyrStringPrim.cpp	(working copy)
@@ -40,6 +40,9 @@
 # include <regex.h>
 #endif
 
+#include <boost/regex.hpp>
+#include <string>
+
 int prStringAsSymbol(struct VMGlobals *g, int numArgsPushed);
 int prStringAsSymbol(struct VMGlobals *g, int numArgsPushed)
 {
@@ -178,6 +181,114 @@
 	return(0);
 }
 
+
+int prString_FindRegexp(struct VMGlobals *g, int numArgsPushed)
+{
+	int err;
+
+	PyrSlot *a = g->sp - 2; // source string
+	PyrSlot *b = g->sp - 1; // pattern
+	PyrSlot *c = g->sp;     // offset
+		
+	// std::cout << " num of args: " <<  g->numpop << std::endl;
+
+	if (!isKindOfSlot(b, class_string) || (c->utag != tagInt)) return 
errWrongType;
+
+	int offset = c->ui;
+
+	char *string = (char*)malloc(a->uo->size + 1);
+	err = slotStrVal(a, string, a->uo->size + 1);
+	if (err) return err;
+	
+	char *pattern = (char*)malloc(b->uo->size + 1);
+	err = slotStrVal(b, pattern, b->uo->size + 1);
+	if (err) return err;
+	
+
+	// std::cout << "input string: " << string << std::endl;
+	// std::cout << "     pattern: " << pattern << std::endl;
+
+	// std::cout << "      offset: " << offset << std::endl;	
+
+	std::string stringstring(string);
+	std::string::const_iterator start, end;
+
+	start = stringstring.begin() + offset;
+	end = stringstring.end();
+	
+	if (start >= end)
+	{
+		SetNil(a);
+		return errNone;
+	}
+
+	try
+	{
+		boost::match_results<std::string::const_iterator> what; 
+		boost::match_flag_type flags = boost::match_default; 
+	
+		boost::regex expression(pattern);
+
+		bool matched = boost::regex_search(start, end, what, expression, flags);
+
+		PyrObject *result_array = newPyrArray(g->gc, what.size(), 0, true);
+		result_array->size = 0;
+
+
+		if (matched)
+		{
+			for (size_t i = 0; i < what.size(); ++i)
+			{
+				if (what[0].matched == false)
+				{
+					result_array->size++;
+					SetNil(result_array->slots+i);
+				}
+				else
+				{
+					result_array->size++;
+	
+					int match_start =  what[i].first - start;
+					int match_length = what[i].second - what[i].first;
+
+					char *match = (char*)malloc(match_length + 1);
+					strncpy(match, string + offset + match_start, match_length);
+					match[match_length] = 0;
+	
+					PyrObject *array = newPyrArray(g->gc, 2, 0, true);
+					array->size = 2;
+	
+					SetInt(array->slots, match_start + offset);
+	
+					PyrObject *matched_string = (PyrObject*)newPyrString(g->gc, match, 0, 
true);
+					SetObject(array->slots+1, matched_string);
+	
+					SetObject(result_array->slots + i, array);
+				}
+			}
+		}
+		else
+		{
+			SetNil(a);
+			return errNone;
+		}
+	
+		SetObject(a, result_array);
+
+		return errNone;
+	}
+	catch (boost::regex_error e)
+	{
+		SetNil(a);
+		return errNone;
+	}
+
+	SetNil(a);
+
+	return errNone;
+}
+
+
 int prString_Regexp(struct VMGlobals *g, int numArgsPushed)
 {
 	int err, start, end;
@@ -622,11 +733,12 @@
 	definePrimitive(base, index++, "_String_AsFloat", prString_AsFloat, 1, 0);	
 	definePrimitive(base, index++, "_String_AsCompileString", 
prString_AsCompileString, 1, 0);	
 	definePrimitive(base, index++, "_String_Getenv", prString_Getenv, 1, 0);
-    definePrimitive(base, index++, "_String_Setenv", prString_Setenv, 2, 0);
-    definePrimitive(base, index++, "_String_Find", prString_Find, 4, 0);
+	definePrimitive(base, index++, "_String_Setenv", prString_Setenv, 2, 0);
+	definePrimitive(base, index++, "_String_Find", prString_Find, 4, 0);
 	definePrimitive(base, index++, "_String_FindBackwards", 
prString_FindBackwards, 4, 0);
-    definePrimitive(base, index++, "_String_Format", prString_Format, 2, 0);
+	definePrimitive(base, index++, "_String_Format", prString_Format, 2, 0);
 	definePrimitive(base, index++, "_String_Regexp", prString_Regexp, 4, 0);
+	definePrimitive(base, index++, "_String_FindRegexp", prString_FindRegexp, 3, 
0);
 	definePrimitive(base, index++, "_StripRtf", prStripRtf, 1, 0);
 	definePrimitive(base, index++, "_String_GetResourceDirPath", 
prString_GetResourceDirPath, 1, 0);
 	definePrimitive(base, index++, "_String_StandardizePath", 
prString_StandardizePath, 1, 0);	
Index: SConstruct
===================================================================
--- SConstruct	(revision 6504)
+++ SConstruct	(working copy)
@@ -790,7 +790,7 @@
                '#Headers/server',
                '#Source/lang/LangSource/Bison'],
     CPPDEFINES = [['USE_SC_TERMINAL_CLIENT', env['TERMINAL_CLIENT']]],
-    LIBS = ['common', 'scsynth', 'pthread', 'dl', 'm'],
+    LIBS = ['common', 'scsynth', 'pthread', 'dl', 'm', 'boost_regex'],
     LIBPATH = 'build'
     )
 if PLATFORM == 'darwin':


-- 
Palimm Palimm!
http://tapas.affenbande.org
-------------- next part --------------
A non-text attachment was scrubbed...
Name: sc-regexp-patch.diff
Type: text/x-diff
Size: 5547 bytes
Desc: not available
Url : http://www.create.ucsb.edu/pipermail/sc-devel/attachments/20071103/d78a4ae3/sc-regexp-patch-0001.bin


More information about the Sc-devel mailing list