- 1: open https://lexica.art/
- 2: open browser console
- 3: paste in the JavaScript, hit enter
- 4: expand the output and copy it to a text file
- 5: optionally, repeat with multiple sets of pages
- 6: Save the `.cs` and `.csproj` files together in a folder, then open the `.csproj` in Visual Studio
- 7: Edit the `Run(...)` inputs to your file paths
- 8: Run it
- 9: enjoy! Train a model off the content or something
Last active
January 27, 2023 10:27
-
-
Save mcmonkey4eva/176fa6fc9a121cd19117674dcd26b26e to your computer and use it in GitHub Desktop.
Lexica scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// JavaScript for your browser console.
// Walks every <div> on the page, keeps those whose first attribute value is
// 'gridcell' and that link to a lexica prompt page, and logs one line per cell:
//   "<prompt URL> <image URL> <text of each <p> in the cell>"
var out = "";
for (var elem of document.getElementsByTagName('div')) {
    // Skip anything that is not a grid cell. A div with no attributes at all is
    // skipped too; otherwise it falls through and the lookups below can hit undefined.
    var firstAttr = elem.attributes[0];
    if (firstAttr === undefined || firstAttr.nodeValue != 'gridcell') {
        continue;
    }
    var aElem = elem.getElementsByTagName('a')[0];
    if (aElem === undefined || !aElem.href.startsWith('https://lexica.art/prompt/')) {
        continue;
    }
    var img = elem.getElementsByTagName('img')[0];
    if (img === undefined) {
        // No image in this cell: nothing to download, so do not emit a partial line.
        continue;
    }
    out += aElem.href + " " + img.src;
    for (var pElem of elem.getElementsByTagName('p')) {
        // The output is one record per line, so line breaks inside the prompt
        // text are flattened to spaces to keep the record on a single line.
        out += " " + pElem.innerText.replace(/[\r\n]+/g, ' ');
    }
    out += "\n";
}
console.log(out);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Net.Http; | |
using System.IO; | |
using System.Drawing; | |
using System.Drawing.Imaging; | |
using System.Threading.Tasks; | |
using FreneticUtilities.FreneticExtensions; | |
using FreneticUtilities.FreneticToolkit; | |
// Entry point: edit the two paths below before running.
//   fileName  - PATH TO TEXT FILE HERE (the text copied out of the browser console)
//   outFolder - PATH TO OUT DIR HERE
Run(fileName: "./my-file.txt", outFolder: "./out/");
/////////// | |
// Downloads every image listed in a scrape file and stores it next to its prompt.
//
// fileName:  text file with one record per line, formatted as
//            "<page url> <image url> <prompt text>" (space separated, prompt last).
// outFolder: directory that receives "<id>.png" and "<id>.txt" for each record,
//            where <id> is the hex characters of the image URL's last path segment.
//            It is created if it does not exist yet.
//
// Failures on a single record are logged and followed by a one second pause;
// they never abort the whole run.
static void Run(string fileName, string outFolder)
{
    AsciiMatcher hex = new(AsciiMatcher.Digits + "abcdefABCDEF");
    using HttpClient client = new();
    client.DefaultRequestHeaders.UserAgent.ParseAdd("Scrapey/1.0");
    // Make sure the target exists up front, otherwise every single save below fails.
    if (!string.IsNullOrEmpty(outFolder))
    {
        Directory.CreateDirectory(outFolder);
    }
    string[] set = File.ReadAllText(fileName).Replace('\r', '\n').Split('\n', StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
    int count = 0;
    foreach (string url in set)
    {
        try
        {
            // Pre-increment so progress reads "1 / N" through "N / N".
            Console.WriteLine($"Load {++count} / {set.Length}...");
            string[] opts = url.Split(' ', 3);
            if (opts.Length != 3)
            {
                Console.WriteLine($"Ignore {url} as invalid");
                continue;
            }
            string prompt = opts[2];
            if (prompt.EndsWith("...", StringComparison.Ordinal))
            {
                // The prompt was truncated by the page: drop the trailing partial word.
                prompt = prompt.BeforeLast(' ');
            }
            string uuid = hex.TrimToMatches(opts[1].AfterLast('/'));
            if (uuid.Length == 0)
            {
                // Without an id every such record would overwrite the same ".png"/".txt".
                Console.WriteLine($"Ignore {url} as invalid");
                continue;
            }
            // GetAwaiter().GetResult() surfaces the real exception instead of an AggregateException.
            byte[] imageData = client.GetByteArrayAsync(opts[1]).GetAwaiter().GetResult();
            // The stream must outlive the Image; using declarations dispose in reverse order,
            // so the image is released before the stream.
            using MemoryStream imageStream = new(imageData);
            using Image image = Image.FromStream(imageStream);
            int wider = Math.Max(image.Width, image.Height);
            // Shrink so the longer side is at most 768 pixels; never enlarge.
            float scale = wider > 768 ? 768f / wider : 1;
            // Clamp to 1 so an extremely thin image cannot truncate to a zero-sized bitmap.
            int targetWidth = Math.Max(1, (int)(image.Width * scale));
            int targetHeight = Math.Max(1, (int)(image.Height * scale));
            // Disposed every iteration: an undisposed Bitmap leaks GDI+ resources across the loop.
            using Bitmap clone = new(targetWidth, targetHeight, PixelFormat.Format24bppRgb);
            using (Graphics gr = Graphics.FromImage(clone))
            {
                gr.DrawImage(image, new Rectangle(0, 0, clone.Width, clone.Height));
            }
            // Path.Combine keeps the paths correct whether or not outFolder ends with a separator.
            clone.Save(Path.Combine(outFolder, uuid + ".png"), ImageFormat.Png);
            File.WriteAllText(Path.Combine(outFolder, uuid + ".txt"), prompt);
        }
        catch (Exception ex)
        {
            // Best effort: report the failure, back off briefly, then move on to the next record.
            Console.WriteLine($"Couldn't load page {url} because error {ex}");
            Task.Delay(1000).Wait();
        }
    }
    Console.WriteLine("Done!");
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!--
  Project file for the scraper program.
  Builds an executable (OutputType Exe) targeting net6.0-windows and references
  two NuGet packages: FreneticLLC.FreneticUtilities and System.Drawing.Common.
-->
<Project Sdk="Microsoft.NET.Sdk"> | |
<PropertyGroup> | |
<OutputType>Exe</OutputType> | |
<TargetFramework>net6.0-windows</TargetFramework> | |
</PropertyGroup> | |
<ItemGroup> | |
<PackageReference Include="FreneticLLC.FreneticUtilities" Version="1.0.1" /> | |
<PackageReference Include="System.Drawing.Common" Version="7.0.0" /> | |
</ItemGroup> | |
</Project> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment