Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 62 additions & 41 deletions core/src/main/scala/org/dbpedia/extraction/util/WikiInfo.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,68 +20,89 @@ class WikiInfo(val wikicode: String, val pages: Int)
object WikiInfo
{
val logger = Logger.getLogger(WikiInfo.getClass.getName)

// hard-coded - there probably is no mirror, and the format is very specific.
// TODO: user might want to use a local file...
// TODO: mayby change this to XML serialization
val URL = new URL("http://wikistats.wmflabs.org/api.php?action=dump&table=wikipedias&format=csv")
// Most browsers would save the file with this name, because s23.org returns it in a http header.

// Most browsers would save the file with this name
val FileName = "wikipedias.csv"

def fromFile(file: File, codec: Codec): Seq[WikiInfo] = {
val source = Source.fromFile(file)(codec)
try fromSource(source) finally source.close
}

def fromURL(url: URL, codec: Codec): Seq[WikiInfo] = {
val source = Source.fromURL(url)(codec)
try fromSource(source) finally source.close
}
def fromSource(source: Source): Seq[WikiInfo] = {

def fromSource(source: Source): Seq[WikiInfo] = {
fromLines(source.getLines)
}

/**
* Retrieves a list of all available Wikipedias from a CSV file like http://s23.org/wikistats/wikipedias_csv.php
*
*/
def fromLines(lines: Iterator[String]): Seq[WikiInfo] = {
* Retrieves a list of all available Wikipedias from a CSV file.
*/
def fromLines(lines: Iterator[String]): Seq[WikiInfo] = {
val info = new ArrayBuffer[WikiInfo]

if (! lines.hasNext) throw new Exception("empty file")
lines.next // skip first line (headers)

for (line <- lines)
if (line.nonEmpty)
fromLine(line) match{
case Some(x) => info += x
case None =>

if (!lines.hasNext) {
logger.warning("wikipedias.csv is empty")
return info
}

lines.next() // skip header

for (line <- lines) {
if (line.nonEmpty) {
fromLine(line) match {
case Some(wikiInfo) => info += wikiInfo
case None => // skip malformed line
}

}
}

info
}

/**
* Reads a WikiInfo object from a single CSV line.
* Malformed lines are logged and skipped.
*/
def fromLine(line: String): Option[WikiInfo] = {
val fields = line.split(",", -1)

if (fields.length < 15) throw new Exception("expected [15] fields, found ["+fields.length+"] in line ["+line+"]")

val pages = try fields(4).toInt
catch { case nfe: NumberFormatException => 0 }

val wikiCode = fields(2)
if (! ConfigUtils.LanguageRegex.pattern.matcher(fields(2)).matches) throw new Exception("expected language code in field with index [2], found line ["+line+"]")

//if(Language.map.keySet.contains(wikiCode))
Option(new WikiInfo(wikiCode, pages))
//else
//{
// logger.log(Level.WARNING, "Language: " + wikiCode + " will be ignored. Add this language to the addonlangs.json file to extract it.")
// None
//}

val fields = line.split(",", -1)

// 1️⃣ Validate field count
if (fields.length < 15) {
logger.warning(
s"Skipping malformed CSV line: expected 15 fields, found ${fields.length}. Line: [$line]"
)
return None
}

// 2️⃣ Parse pages safely
val pages =
try fields(4).toInt
catch {
case _: NumberFormatException =>
logger.warning(
s"Invalid page count in CSV line, defaulting to 0. Line: [$line]"
)
0
}

// 3️⃣ Validate language code
val wikiCode = fields(2)
if (!ConfigUtils.LanguageRegex.pattern.matcher(wikiCode).matches) {
logger.warning(
s"Invalid language code [$wikiCode] in CSV line, skipping. Line: [$line]"
)
return None
}

// 4️⃣ Valid line → create WikiInfo
Some(new WikiInfo(wikiCode, pages))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -148,31 +148,50 @@ abstract class ExtractionManager(

protected def loadMappingPages(language : Language) : Map[WikiTitle, WikiPage] =
{
val namespace = language.wikiCode match {
case "wikidata" =>
Namespace.mappings(Language.English)
case _ =>
Namespace.mappings.getOrElse(language, throw new NoSuchElementException("no mapping namespace for language "+language.wikiCode))
}
val namespaceOpt = language.wikiCode match {
case "wikidata" =>
Some(Namespace.mappings(Language.English))
case _ =>
Namespace.mappings.get(language)
}

if (namespaceOpt.isEmpty) {
logger.warning(
s"No mapping namespace for language ${language.wikiCode} – skipping mapping pages."
)
return Map.empty
}

val source = if (paths.mappingsDir != null && paths.mappingsDir.isDirectory)
{
val file = new File(paths.mappingsDir, namespace.name(Language.Mappings).replace(' ','_')+".xml")
if(!file.exists()) {
logger.warning("MAPPING FILE [" + file + "] DOES NOT EXIST! WILL BE IGNORED")
return Map[WikiTitle, WikiPage]()
}
logger.warning("LOADING MAPPINGS NOT FROM SERVER, BUT FROM LOCAL FILE ["+file+"] - MAY BE OUTDATED - ONLY FOR TESTING!")
XMLSource.fromFile(file, language) // TODO: use Language.Mappings?
}
else
{
val url = paths.apiUrl
WikiSource.fromNamespaces(Set(namespace), url, language) // TODO: use Language.Mappings?
val namespace = namespaceOpt.get

val source = if (paths.mappingsDir != null && paths.mappingsDir.isDirectory)
{
val file = new File(
paths.mappingsDir,
namespace.name(Language.Mappings).replace(' ','_') + ".xml"
)

if (!file.exists()) {
logger.warning(
"MAPPING FILE [" + file + "] DOES NOT EXIST! WILL BE IGNORED"
)
return Map.empty
}

source.map(page => (page.title, page)).toMap
logger.warning(
"LOADING MAPPINGS NOT FROM SERVER, BUT FROM LOCAL FILE [" + file +
"] - MAY BE OUTDATED - ONLY FOR TESTING!"
)

XMLSource.fromFile(file, language)
}
else
{
val url = paths.apiUrl
WikiSource.fromNamespaces(Set(namespace), url, language)
}

source.map(page => (page.title, page)).toMap
}

protected def loadOntology() : Ontology =
Expand Down
Loading