Skip to content

Commit

Permalink
Add paging support (closes #1)
Browse files: browse the repository at this point in the history
  • Loading branch information
Lustyn committed Jan 14, 2018
1 parent fd62ff7 commit 7105c4b
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 22 deletions.
2 changes: 1 addition & 1 deletion MainForm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ private void button1_Click(object sender, EventArgs e)
// Clears ItemView's items, then hands Scraper.Scrape for Category to Task.Factory.StartNew.
private void LoadContent()
{
ItemView.Items.Clear();
// NOTE(review): this text is a flattened diff hunk ("1 addition & 1 deletion"), so the two
// calls below are presumably the removed (page 0) and added (page 1) versions of one line,
// not two sequential calls — confirm against the actual MainForm.cs.
// The page argument looks 1-based after this commit: Scraper builds start=(page-1)*20.
Task.Factory.StartNew(() => Scraper.Scrape(Category,0));
Task.Factory.StartNew(() => Scraper.Scrape(Category,1));
}

delegate void AddRowCallback(ListViewItem row);
Expand Down
87 changes: 66 additions & 21 deletions Scraper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ namespace TMF_Simplifier
{
class Scraper
{
private static int totalPages = 0;
private static int itemsScraped = 0;
private static int pagesScraped = 0;
private static ScrapingBrowser Browser = new ScrapingBrowser();

public async static void Scrape(int category, int page)
Expand All @@ -21,36 +24,78 @@ public async static void Scrape(int category, int page)
{
Browser.AllowAutoRedirect = true;
Browser.AllowMetaRedirect = true;
WebPage PageResult = await Browser.NavigateToPageAsync(new Uri("http://totalminerforums.net/index.php?action=downloads;cat="+category/*+";page="+page*/));
Console.WriteLine("Scraped");
WebPage PageResult = await Browser.NavigateToPageAsync(new Uri($"http://totalminerforums.net/index.php?action=downloads;cat={category};start={(page-1)*20}"));
HtmlNode Table = PageResult.Html.CssSelect(".table_grid").First();
foreach (var row in Table.SelectNodes("tr").Where(t => !t.Attributes.Contains("class")))
foreach (var row in Table.SelectNodes("tr"))
{
List<string> cells = new List<string>();
int celli = 0;
foreach (var cell in row.SelectNodes("td"))
if (!row.Attributes.Contains("class"))
{
if (celli < 5 || celli == 7)
List<string> cells = new List<string>();
int celli = 0;
foreach (var cell in row.SelectNodes("td"))
{
if (celli == 1 && cell.InnerText != "(None)")
if (celli < 5 || celli == 7)
{
cells.Add(cell.SelectNodes("img").Count + "/5");
}
else
{
cells.Add(HttpUtility.HtmlDecode(cell.InnerText));
}
if (celli == 1 && cell.InnerText != "(None)")
{
cells.Add(cell.SelectNodes("img").Count + "/5");
}
else
{
cells.Add(HttpUtility.HtmlDecode(cell.InnerText));
}

if (celli == 0)
{
string href = cell.SelectSingleNode("a").Attributes["href"].Value;
int index = href.IndexOf("down=") + 5;
TMFS.Ids[category].Add(int.Parse(href.Substring(index)));
if (celli == 0)
{
string href = cell.SelectSingleNode("a").Attributes["href"].Value;
int index = href.IndexOf("down=") + 5;
TMFS.Ids[category].Add(int.Parse(href.Substring(index)));
}
}
celli++;
}
TMFS.Instance.AddRow(new ListViewItem(cells.ToArray()));
itemsScraped++;
} else if (row.Attributes.Contains("class") && row.Attributes["class"].Value == "titlebg")
{
var htmlNodes = row.SelectNodes("td");//row.FirstChild.SelectNodes("a").Where(n => n.Attributes["class"].Value == "navPages");

foreach(HtmlNode node in htmlNodes)
{
if(node.SelectNodes("a").Count() > 0)
{
foreach (HtmlNode aNode in node.SelectNodes("a"))
{
if (aNode.Attributes.Contains("class") && aNode.Attributes["class"].Value == "navPages")
{
int maxpage;
if (int.TryParse(aNode.InnerText, out maxpage))
{
if (maxpage > totalPages)
{
totalPages = maxpage;
Console.WriteLine($"New max page: {maxpage}");
}
}
else
{
Console.WriteLine($"Failed to parse page #{node.InnerText}");
}
}
}
}
}
celli++;
}
TMFS.Instance.AddRow(new ListViewItem(cells.ToArray()));
}
pagesScraped++;
Console.WriteLine($"Got page {page}");

if(pagesScraped < totalPages)
{
Scrape(category, pagesScraped + 1);
} else
{
Console.WriteLine($"Finished scraping {itemsScraped} items and {pagesScraped} pages");
}
}
catch (Exception e)
Expand Down

0 comments on commit 7105c4b

Please sign in to comment.