String matching is the problem of searching for a string inside a larger piece of text. Many programs require string matching including text editors and web browsers.
The C library provides the strstr function which performs string matching.
We will write our own solutions to the string matching problem sequentially, and also in parallel.
The following base C program reads in a text file, then asks the user what string to search for:
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
int string_search(char* data, char* target) {
/* TODO */
}
unsigned long ms_diff(struct timeval a, struct timeval b) {
unsigned long msa = (a.tv_sec * 1000) + (a.tv_usec / 1000);
unsigned long msb = (b.tv_sec * 1000) + (b.tv_usec / 1000);
return msb - msa;
}
int main(int argc, char** argv) {
FILE* file;
unsigned int length;
char* data;
char target[256];
struct timeval a, b;
/* get start time */
gettimeofday(&a, NULL);
/* open the file */
if (argc lt; 2) {
printf("Please pass an input file.\n");
return 0;
}
file = fopen(argv[1], "r");
if (!file) {
printf("Could not open %s for reading.\n", argv[1]);
return 0;
}
/* find the length of the file */
fseek(file, 0L, SEEK_END);
length = ftell(file);
fseek(file, 0L, SEEK_SET);
/* read the file into memory */
data = malloc(length * sizeof(char) + 1);
memset(data, 0, length);
fread(data, sizeof(char), length, file);
gettimeofday(&b, NULL);
unsigned long read = ms_diff(a, b);
printf("Reading took %lu ms.\n", read);
/* ask what should be searched for */
printf("Enter search term: ");
scanf("%s", target);
gettimeofday(&a, NULL);
/* now do the searching */
int index = string_search(data, target);
gettimeofday(&b, NULL);
unsigned long search = ms_diff(a, b);
printf("Searching took %lu ms.\n", search);
if (index != -1) {
printf("Found it at %d!\n", index);
} else {
printf("Not found...\n");
}
return 0;
}
We only need to complete the string_search function which should return the index of the target string, or -1 if it is not found.
How can we write a function which will solve this problem sequentially?
int string_search(char* data, char* target) {
int targ_length = strlen(target);
// for each starting index
for (int start = 0; start < (data_length - targ_length); start++) {
// for each one past this
int found = 1;
for (int letter = 0; letter < targ_length; letter++) {
if (data[start + letter] != target[letter]) {
// this start is not right
found = 0;
break;
}
}
if (found) {
return start;
}
}
return -1;
}
We can use the following files to test our program:
A "small" test case containing all of the works of William Shakespeare.
A "large" test case containing the first billion digits of Pi.
Amdahl's law tells us how much speedup we can expect, given how much of our program must be executed sequentially.
Note that reading a file the first time can be much slower than reading it again immediately afterwards. This is because the OS caches the data from the hard drive in memory making it much faster to access!
How can we parallelize this program?
struct node_t* string_search(char* data, char* target, int data_length) {
int targ_length = strlen(target);
/* the list of locations we found it at */
struct node_t* head = NULL;
// for each starting index
#pragma omp parallel for num_threads(8)
for (int start = 0; start < (data_length - targ_length); start++) {
// for each one past this
int found = 1;
for (int letter = 0; letter < targ_length; letter++) {
if (data[start + letter] != target[letter]) {
// this start is not right
found = 0;
break;
}
}
// check if it was found
if (found) {
#pragma omp critical
list_insert(&head, start);
}
}
return head;
}
Copyright © 2024 Ian Finlayson | Licensed under a Creative Commons BY-NC-SA 4.0 License.