Created
February 17, 2015 04:03
-
-
Save ShadowKyogre/86881fe3e2b9ff00492b to your computer and use it in GitHub Desktop.
Written in Python 3. This scrapes the article names of all the OCG cards available in YGO so far. It will need to be made more modular so that it can pull article names from any site's categories later.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape the titles of every article in the wiki's ``Category:OCG_cards``.

The script pages through the MediaWiki ``categorymembers`` API at
yugioh.wikia.com, prints each title as it is seen, and finally writes all
collected titles (one per line, UTF-8) to ``articles.txt`` in the current
working directory.
"""
from urllib.parse import quote

from lxml import etree

# Base query; the continuation token is appended to this for later pages.
cards = "http://yugioh.wikia.com/api.php?format=xml&action=query&list=categorymembers&cmtitle=Category:OCG_cards&cmprop=title&cmlimit=5000"
# Category members that are list pages rather than individual card articles.
ignore = {"List of unnamed cards", "List of unseen cards"}
# "" means "first page"; None means "no further pages" and ends the loop.
cmcontinue = ""
titles = []
querycount = 1

while cmcontinue is not None:
    print("--- page {} ---".format(querycount))

    if cmcontinue:
        # The token is opaque server data and may contain characters such as
        # "|" or spaces, so it must be percent-encoded before it is placed in
        # the query string.
        url = "{}&cmcontinue={}".format(cards, quote(cmcontinue, safe=""))
    else:
        url = cards
    xmltree = etree.parse(url).getroot()

    for title in xmltree.iter("cm"):
        name = title.attrib["title"]
        if name in ignore:
            continue
        print(name)
        # Store with the trailing newline so writelines() emits one per line.
        titles.append("{}\n".format(name))

    # Look for <query-continue><categorymembers cmcontinue="..."/>; when it
    # is absent this was the last page.
    next_token = None
    for durp in xmltree.iter("query-continue"):
        for hurp in durp.iter("categorymembers"):
            next_token = hurp.attrib["cmcontinue"]

    if next_token is not None:
        querycount += 1
    cmcontinue = next_token

with open("articles.txt", mode="w", encoding="utf-8") as f:
    f.writelines(titles)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment