Last active
August 29, 2015 14:14
-
-
Save ashleymills/549ab8aff05ec90f4350 to your computer and use it in GitHub Desktop.
Fetch all lower case English language words from Wiktionary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
extension String {
    /// Collects every substring that sits between an occurrence of `fromTag`
    /// and the next occurrence of `toTag` that follows it.
    ///
    /// The receiver is scanned left to right. After each match the search
    /// resumes immediately after the closing tag, so matches never overlap.
    ///
    /// :param: fromTag The opening delimiter to look for.
    /// :param: toTag The closing delimiter that ends a match.
    /// :returns: The enclosed substrings in order of appearance, or `nil`
    ///           when no complete `fromTag`…`toTag` pair exists.
    func stringsBetween(fromTag: String, and toTag: String) -> [String]? {
        var strings: [String]?
        var searchRange = Range(start: self.startIndex, end: self.endIndex)
        while let openRange = rangeOfString(fromTag, options: nil, range: searchRange) {
            // The matched range already tells us where the tag ends; there is
            // no need to step over the tag one character at a time.
            let contentStart = openRange.endIndex
            let remainder = Range(start: contentStart, end: self.endIndex)
            if let closeRange = rangeOfString(toTag, options: nil, range: remainder) {
                let string = substringWithRange(Range(start: contentStart, end: closeRange.startIndex))
                if strings == nil {
                    strings = [string]
                } else {
                    strings?.append(string)
                }
                // Continue looking for the next opening tag after this closing tag.
                searchRange = Range(start: closeRange.endIndex, end: self.endIndex)
            } else {
                // An opening tag without a closing tag ends the scan; return
                // whatever was collected before it.
                return strings
            }
        }
        return strings
    }

    /// Reports whether the receiver is non-empty and made up solely of
    /// lowercase letters.
    ///
    /// The check uses `NSCharacterSet.lowercaseLetterCharacterSet()`, so any
    /// character outside that set (digits, punctuation, uppercase letters,
    /// whitespace, ...) makes the string invalid.
    /// NOTE(review): that set is not limited to ASCII a-z; confirm whether
    /// accented lowercase letters should be accepted as dictionary words.
    ///
    /// :returns: `true` when the string is a candidate dictionary word.
    func isValidDictionaryString() -> Bool {
        // An empty string contains no disallowed character, but it is not a word.
        if isEmpty {
            return false
        }
        let nonLower = NSCharacterSet.lowercaseLetterCharacterSet().invertedSet
        return rangeOfCharacterFromSet(nonLower) == nil
    }
}
// Base address of the index pages; each page name is a letter followed by a
// page number (for example "a1", "a2").
let baseURL = NSURL(string: "http://en.wiktionary.org/wiki/Index:English/")
let letters = Array("abcdefghijklmnopqrstuvwxyz")
let numbers = Array("12")
var error: NSError?

/// Extracts the candidate words from the HTML of one index page.
///
/// Looks inside every `<ol>` list, then every `<li>` item, then takes the
/// text between `<a href="/wiki/` and `" title` as a word. Only strings that
/// pass `isValidDictionaryString()` are kept, in page order.
func dictionaryWordsInIndexPage(html: String) -> [String] {
    var words = [String]()
    for olString in html.stringsBetween("<ol>", and: "</ol>") ?? [] {
        for liString in olString.stringsBetween("<li>", and: "</li>") ?? [] {
            for wordString in liString.stringsBetween("<a href=\"/wiki/", and: "\" title") ?? [] {
                if wordString.isValidDictionaryString() {
                    words.append(wordString)
                }
            }
        }
    }
    return words
}

// Fetch every index page (letter x page number) and print one word per line.
for letter in letters {
    for number in numbers {
        if let URL = baseURL?.URLByAppendingPathComponent(String(letter) + String(number)) {
            if let response = String(contentsOfURL: URL, encoding: NSUTF8StringEncoding, error: &error) {
                for word in dictionaryWordsInIndexPage(response) {
                    println("\(word)")
                }
            } else {
                // Report the failure instead of silently skipping the page.
                // NSLog is used so the word list on standard output stays clean.
                let reason = error?.localizedDescription ?? "unknown error"
                let message = "Failed to fetch \(URL): \(reason)"
                NSLog("%@", message as NSString)
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment